Nov-12-2018, 12:39 PM
(This post was last modified: Nov-12-2018, 02:33 PM by Baggelhsk95.)
I was testing the following code, and while debugging I saw that scraped_items showed something like 4.777, which isn't the result I wanted. Second, I want to write the output of each def function to a different file, and finally I want the spider to run all of the functions, not only the first and second ones. :(
Thank you very much!!! :D
Here is my actual code:
Thank you very much!!! :D
Here is my actual code:
# -*- coding: utf-8 -*-
import scrapy
class SccbotSpider(scrapy.Spider):
    """Spider that walks four page levels starting from ``start_urls``.

    Callback chain: ``parse`` -> ``parse_details`` -> ``parse_items`` ->
    ``parse_ims``. Each callback yields one dict per matched element and
    (except the last) schedules the links found on the page for the next
    callback in the chain.

    All fields are produced with ``extract()``, so every value except the
    URL field is a list of strings (possibly empty).
    """

    name = 'SccBot'
    start_urls = ['https://spurverbreiterung.de/index.php?cat=c182_Radbefestigungsteile.html']

    def _follow(self, response, hrefs, callback):
        """Yield a ``scrapy.Request`` for each href, made absolute.

        :param response: response whose URL is the base for relative hrefs.
        :param hrefs: iterable of (possibly relative) link strings.
        :param callback: spider method that should handle each response.
        """
        for href in hrefs:
            yield scrapy.Request(url=response.urljoin(href), callback=callback)

    def parse(self, response):
        """Handle a start page: emit the link cells of ``#tab1`` and follow them.

        Yields one dict per ``tr > td[align="center"]`` cell inside ``#tab1``,
        then one request per direct ``a`` child of those cells, handled by
        ``parse_details``.
        """
        tab1 = response.css('#tab1')
        for cell in tab1.css('tr > td[align="center"]'):
            yield {
                'TextBox': cell.css('a::text').extract(),
                'LinkBox': cell.css('a::attr(href)').extract(),
                'CurrentUrl': response.url,
            }

        # Links are collected once per page rather than once per cell; the
        # set of hrefs does not depend on the cell being processed.
        hrefs = tab1.css('tr > td[align="center"] > a::attr(href)').extract()
        for request in self._follow(response, hrefs, self.parse_details):
            yield request

    def parse_details(self, response):
        """Handle a second-level page: emit its link cells and follow them.

        Unlike ``parse``, the cell selector is applied to the whole response
        and is not restricted to ``#tab1``.
        NOTE(review): if the large item count seen while debugging comes from
        this callback, the unscoped selector is the likely cause -- confirm
        against the page markup before narrowing it.
        """
        for cell in response.css('tr > td[align="center"]'):
            yield {
                'TextBox': cell.css('a::text').extract(),
                'LinkBox': cell.css('a::attr(href)').extract(),
                'CurrentUrl': response.url,
            }

        hrefs = response.css('tr > td[align="center"] > a::attr(href)').extract()
        for request in self._follow(response, hrefs, self.parse_items):
            yield request

    def parse_items(self, response):
        """Handle a product listing page: emit one dict per product link.

        Products are the ``div.inhalt > a.product_link`` elements. Each
        product's own link is then requested and handled by ``parse_ims``.
        """
        # These two values are read from the page, not from the product, so
        # they are extracted once instead of once per product.
        category = response.css('#main_content > h1::text').extract()
        category_type = response.css('div.content_boxes > div.rad_header::text').extract()

        for product in response.css('div.inhalt > a.product_link'):
            yield {
                # Copies keep every emitted item independent of the others.
                'Category': list(category),
                'CategoryType': list(category_type),
                'ProductName': product.css('div.prod-name::text').extract(),
                'ProductNumber': product.css('div.art-nr > span::text').extract(),
                # No ::text here: the whole div.preis element is extracted.
                'Price': product.css('div.preis').extract(),
                'AvaibilityIcon': product.css('div.ampel > img::attr(src)').extract(),
                'ProductLink': product.css('a.product_link::attr(href)').extract(),
                'CurrentURL': response.url,
            }

        hrefs = response.css('div.inhalt > a.product_link::attr(href)').extract()
        for request in self._follow(response, hrefs, self.parse_ims):
            yield request

    def parse_ims(self, response):
        """Handle a product detail page: emit one dict per ``div.wrapper``.

        This is the last callback in the chain; it schedules no requests.
        Fields without ``::text`` / ``::attr`` are extracted as whole elements.
        """
        for wrapper in response.css('div.wrapper'):
            yield {
                'Title': wrapper.css('#product_info > h1::text').extract(),
                'Price': wrapper.css('div.productsinfo_price > span::text').extract(),
                'ProductDetails': wrapper.css('div.product_details.clear > table').extract(),
                'ProductInfo': wrapper.css('div.productsinfo_right').extract(),
                'ProductImg': wrapper.css('div.productsinfo_img > ul > img::attr(src)').extract(),
                'MoreDetails': wrapper.css('div.textf_rechts').extract(),
                'CurrentURL': response.url,
            }
