Sep-06-2022, 05:24 AM
from scrapy import Spider
from scrapy.http import Request
class TesterSpider(Spider):
    """Crawl books.toscrape.com and yield one item per book page.

    ``parse`` handles listing pages: it schedules a request for every book
    link found under an ``<h3>`` and then follows the "next" link, if any.
    ``parse_book`` handles the individual book pages and yields a dict of
    the scraped fields.
    """

    name = 'tester'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield a request per book on a listing page, then the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``Request`` objects: one per book link (handled by
            ``parse_book``) and, when a "next" link exists, one for the
            following listing page (handled by this method again).
        """
        for book in response.xpath("//h3/a/@href").extract():
            yield Request(response.urljoin(book), callback=self.parse_book)

        # process next page
        next_page_url = response.xpath(
            "//a[text()='next']/@href"
        ).extract_first()
        # The last page has no "next" link; ``urljoin(None)`` would resolve
        # to the current URL and re-request the same page, so stop instead.
        if next_page_url:
            yield Request(response.urljoin(next_page_url), callback=self.parse)

    def parse_book(self, response):
        """Extract the fields of a single book page.

        Args:
            response: the downloaded book page.

        Yields:
            A dict with the keys ``Title``, ``Price``, ``Location`` (the
            image URL), ``Rating``, ``Description``, ``UPC``,
            ``Product Type``, ``Availability`` and ``Reviews``. A value is
            ``None`` when its XPath matched nothing.
        """
        title = response.xpath("//h1/text()").extract_first()
        price = response.xpath(
            "//*[@class='price_color']/text()"
        ).extract_first()

        img_url = response.xpath("//img/@src").extract_first()
        if img_url is not None:
            # The src is relative ('../../media/...'); swap the relative
            # prefix for the site root to get an absolute URL.
            img_url = img_url.replace('../..', 'https://books.toscrape.com')

        rating = response.xpath(
            "//p[starts-with(@class,'star-rating')]/@class"
        ).extract_first()
        if rating is not None:
            # The class attribute reads 'star-rating <word>'; keep the word.
            rating = rating.replace('star-rating ', '')

        desc = response.xpath(
            "//div[(@id='product_description')]/following-sibling::p/text()"
        ).extract_first()

        # Product Description
        # ``product_desc`` lives in the class namespace, so it has to be
        # reached through ``self``; a bare ``product_desc(...)`` is looked up
        # in the module/global scope and fails with NameError.
        upc = self.product_desc(response, 'UPC')
        product_type = self.product_desc(response, 'Product Type')
        availability = self.product_desc(response, 'Availability')
        number_of_reviews = self.product_desc(response, 'Number of reviews')

        yield {
            'Title': title,
            'Price': price,
            'Location': img_url,
            'Rating': rating,
            'Description': desc,
            'UPC': upc,
            'Product Type': product_type,
            'Availability': availability,
            'Reviews': number_of_reviews,
        }

    @staticmethod
    def product_desc(response, lookup):
        """Return the table value whose header cell text equals ``lookup``.

        Args:
            response: the downloaded book page.
            lookup: exact text of the ``<th>`` cell to search for. It is
                spliced into a single-quoted XPath literal, so it must not
                contain a single quote.

        Returns:
            The first text node of the ``<td>`` that follows the matching
            ``<th>``, or ``None`` when there is no match.
        """
        query = "//th[text()='" + lookup + "']/following-sibling::td/text()"
        return response.xpath(query).extract_first()


# Original question from the post:
#   As you can see, at the very bottom the function 'product_desc' is
#   defined, but where I call it, just above the yield block, my IDE
#   (VS Code) reports that it is undefined. Can anyone spot what I am
#   missing? Thank you.
#
# Answer: a function defined inside a class body is not visible as a bare
# name from within other methods. It is now a @staticmethod and is called as
# ``self.product_desc(...)``.
