I am practicing my python skills by scraping different websites and I came across The Hacker News Website I manage to scrape the article's title, links, author names..etc.The problem occurs when i try to advance the scraper feature by filtering between years e.g scrape all the articles in between 2018 and 2019 I try to implement it and it gives me some results but not exact results final output it produces include articles of 2017 as well.Here is my code:
from bs4 import BeautifulSoup
import requests
import csv
import time
results = []
def fetch(url):
response = requests.get(url)
#print(f' | Status code: {response.status_code}')
return response
def parse(response):
#print(f'HTTP GET: {response.url} | Status code: {response.status_code}')
content = BeautifulSoup(response.text, 'lxml')
#Extract Data Fields
labels = content.findAll('div', {'class': 'item-label'})
story_date = [[tag for tag in date][1] for date in labels]
if '2019' or '2018' in story_date:
story_title = [title.text for title in content.find_all('h2', {'class': 'home-title'})]
story_link = [story_link['href'] for story_link in content.find_all('a', {'class': 'story-link'})]
story_author = [[tag for tag in author][2].text.strip('\n')[1:] for author in labels]
for index in range(0, len(story_date)):
results.append({
'date': story_date[index],
'title': story_title[index],
'link': story_link[index],
'author': story_author[index]
})
def export_to_csv(filename):
with open(filename, 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())
writer.writeheader()
for row in results:
writer.writerow(row)
if __name__ == '__main__':
baseURL = 'https://thehackernews.com/search/label/'
categories = ['data%20breach', 'Cyber%20Attack', 'Vulnerability', 'Malware']
years = ['2018', '2019', '2020', '2021']
for category in categories:
for year in years:
for page in range(0, 5):
index = page + 19
url = baseURL + category + f'?updated-max={year}-06-09T13:30:00-07:00&max-results=20&start=' + str(index) + '&by-date=false'
res = fetch(url)
html_parsing = parse(res)
export_to_csv('thn.csv')
time.sleep(2)Desire Output: Filter out all the articles under the N number of years.
