### If you run the code before the two "write" chunks, then everything works out. What in the heck is going wrong when the scrape is copied to Excel?!? I would be very grateful if someone who has mastered this could shed some light on it.
### It won't let me edit the post for another 10 minutes. Basically, the CSV file repeats the reviews from only one product. I need three separate sets of review data, built from my specified attributes and tags. The code runs perfectly up to the point where it writes the files...
import requests as r
from bs4 import BeautifulSoup
#Get URL
main_url = 'http://drd.ba.ttu.edu/isqs6339/imbadproducts/'
response = r.get(main_url)
#Set filepaths
filepath = 'dataout1.csv'
filepath2 = 'dataout2.csv'
#Check for good link and get headers
print(response.status_code)
print (response.headers)
soup = BeautifulSoup(response.text, 'lxml')
print(soup.prettify())
#Find all anchors on the page
search_results = soup.find('div', attrs={'id' : 'searchresults'})
product_results = search_results.find_all('a')
#Define product link, id, title, price, and description for all products
for link in product_results:
link_url = main_url + link.get('href')
productId = link.find('span', attrs={'class' : 'productid'}).text
product_title = link.find('span', attrs={'class' : 'producttitle'}).text
product_price = link.find('span', attrs={'class' : 'productprice'}).text
product_description = link.find('span', attrs={'class' : 'productdesc'}).text
#Get links for each product
response2 = r.get(link_url)
soup2 = BeautifulSoup(response2.text, 'lxml')
#Find each user review for the product on the page
user_review = soup2.find('div', attrs={'id' : 'userreviews'})
review_results = user_review.find_all('div')
#Find author, stars, and review info for each review of the page's product and print results
for rev in review_results:
print ('ProductID: ' + productId)
print ('Product Title: ' + product_title)
print ('Product Price: ' + product_price)
print('Product Description: ' + product_description)
print ('User Review: ' )
author = rev.find('span', attrs={'class' : 'rauthor'}).text
print('Author: ' + author)
stars = rev.find('span', attrs={'class' : 'rstars'}).text
print('Stars: ' + stars)
review_of_product = rev.find('span' , attrs={'class' : 'rtext'}).text
print('Review: ' + review_of_product)
review_length = len(review_of_product)
print('Length: ')
print(review_length)
print('------------')
#Import CSV
import csv
#Open File 1 in CSV File
with open(filepath, 'w') as dataout:
datawriter = csv.writer(dataout, delimiter= ',', quotechar= '"', quoting = csv.QUOTE_NONNUMERIC)
headers = ['ProductId', 'Product Title', 'Product Price', 'Author', 'Stars', 'Length of Review']
datawriter.writerow(headers)
for link in product_results:
productId = link.find('span', attrs={'class' : 'productid'}).text
product_title = link.find('span', attrs={'class' : 'producttitle'}).text
product_price = link.find('span', attrs={'class' : 'productprice'}).text
for rev in review_results:
author = rev.find('span', attrs={'class' : 'rauthor'}).text
stars = rev.find('span', attrs={'class' : 'rstars'}).text
review_of_product = rev.find('span' , attrs={'class' : 'rtext'}).text
datawriter.writerow([productId, product_title, product_price, author, stars, len(review_of_product)])
#Open File 2 in CSV File
with open(filepath2, 'w') as dataout2:
datawriter = csv.writer(dataout2, delimiter= ',', quotechar= '"', quoting = csv.QUOTE_NONNUMERIC)
headers = ['ProductId', 'Author', 'Stars', 'Review Text']
datawriter.writerow(headers)
for link in product_results:
productId = link.find('span', attrs={'class' : 'productid'}).text
for rev in review_results:
author = rev.find('span', attrs={'class' : 'rauthor'}).text
stars = rev.find('span', attrs={'class' : 'rstars'}).text
review_of_product = rev.find('span' , attrs={'class' : 'rtext'}).text
datawriter.writerow([productId, author, stars, review_of_product])It won't let me edit it till 10 minutes. Basically the csv file is repeating the reviews from only one product source. I need three separate review data from my specified attributes and tags. The code runs perfectly before writing it...
