Feb-26-2024, 08:16 PM
I have the below code which scrapes this page: https://www.eeoc.gov/newsroom/search.
It works well, but I also want it to open each URL and scrape the full text of each linked page. Any suggestions on how to modify this code to achieve this?
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def scrape_eec_news(start_page=0, timeout=30):
    """Scrape all news listings from the EEOC newsroom search pages.

    Requests the paginated search results one page at a time and stops at
    the first page that contains no ``div.views-row`` elements.

    Args:
        start_page: Value of the ``page`` query parameter for the first
            request. Defaults to 0: the ``views-row`` markup suggests a
            Drupal Views listing, whose pager is zero-based, so starting
            at 1 would skip the first page of results.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            a stalled connection would block the scraper forever.

    Returns:
        A list of dicts, one per listing, with the keys ``title``,
        ``description``, ``date``, ``url`` and ``agency``. ``url`` is
        resolved against the listing page so it is always absolute (or an
        empty string when the listing has no link).

    Raises:
        requests.HTTPError: If a listing page responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    base_url = "https://www.eeoc.gov/newsroom/search?page="
    agency = "United States Equal Employment Opportunity Commission"
    results = []
    page_number = start_page
    while True:
        page_url = base_url + str(page_number)
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        entries = soup.find_all("div", class_="views-row")
        if not entries:
            # An empty page means we have walked past the last page.
            break
        print("Scraping page", page_number)
        for entry in entries:
            title_elem = entry.h2
            if title_elem is None:
                # A row without a heading is not a news listing; skip it
                # instead of crashing on ``None.text``.
                continue
            description_elem = entry.p
            date_elem = entry.find("div", class_="field--type-datetime")
            link_elem = entry.a
            href = link_elem.get("href") if link_elem is not None else None
            results.append(
                {
                    "title": title_elem.text.strip(),
                    "description": (
                        description_elem.text.strip() if description_elem else ""
                    ),
                    "date": date_elem.text.strip() if date_elem else "",
                    # Listing links may be site-relative; make them absolute
                    # so each one can be opened directly later.
                    "url": urljoin(page_url, href) if href else "",
                    "agency": agency,
                }
            )
        page_number += 1
    return results
def export_to_csv(data, filename):
    """Write the scraped rows to ``filename`` as a UTF-8 CSV file.

    Args:
        data: Iterable of dicts keyed by ``title``, ``description``,
            ``date``, ``url`` and ``agency``.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ["title", "description", "date", "url", "agency"]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)
if __name__ == "__main__":
    # Script entry point: scrape every listing, then save the rows to disk.
    scraped_rows = scrape_eec_news()
    export_to_csv(scraped_rows, "eec_news.csv")
    print("Data exported to eec_news.csv")
