Apr-05-2020, 04:06 PM
I am creating a spider to crawl a webpage's JSON-LD schema markup and store the data in MongoDB. Specifically, I want to scrape the JSON-LD schema markup, extract the data type ("@type": "_____") from it, and store that @type in MongoDB. My spider already crawls the whole schema markup correctly, but I want to know how to extract @type from the JSON-LD schema markup and store it in MongoDB.
These are my spider files:
apple_spider.py
import scrapy
from pprint import pprint
from extruct.jsonld import JsonLdExtractor
from ..items import ApplespiderItem
class AppleSpider(scrapy.Spider):
    """Crawl the Apple accessories page and yield its JSON-LD records.

    Each JSON-LD object that carries a ``name`` is turned into a plain dict
    with the schema.org ``@type`` plus the name, price and URL, ready to be
    stored by an item pipeline.
    """

    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = (
        'http://www.apple.com/shop/mac/mac-accessories',
    )

    def parse(self, response):
        """Extract JSON-LD blocks from ``response`` and yield one dict each.

        Yields:
            dict: ``type`` (the JSON-LD ``@type``), ``name``, ``price`` and
            ``url``. ``price`` and ``url`` are ``None`` when the markup does
            not provide them.
        """
        extractor = JsonLdExtractor()
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        items = extractor.extract(response.text, response.url)
        self.logger.debug(
            'Extracted %d JSON-LD item(s) from %s', len(items), response.url
        )

        for item in items:
            # JsonLdExtractor returns the JSON-LD objects as-is: keys such as
            # '@type', 'name' and 'offers' sit at the top level.  There is no
            # microdata-style 'properties' wrapper to unwrap.
            name = item.get('name')
            if not name:
                continue

            # schema.org allows 'offers' to be a single Offer or a list.
            offers = item.get('offers') or {}
            if isinstance(offers, list):
                offers = offers[0] if offers else {}
            price = offers.get('price') if isinstance(offers, dict) else None

            yield {
                'type': item.get('@type'),
                'name': name,
                'price': price,
                'url': item.get('url'),
            }


# items.py
import scrapy
class ApplespiderItem(scrapy.Item):
    """Container for one scraped product record."""

    # define the fields for your item here like:
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()


# pipelines.py
import pymongo


class ApplespiderPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Items are written to the ``app_tb`` collection of the ``newdb`` database
    on the MongoDB server at ``localhost:27017``.
    """

    def __init__(self):
        self.conn = pymongo.MongoClient(
            'localhost',
            27017
        )
        db = self.conn['newdb']
        self.collection = db['app_tb']

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged."""
        # ``Collection.insert`` is deprecated in PyMongo 3 and removed in
        # PyMongo 4; ``insert_one`` is the supported single-document call.
        # ``dict(item)`` makes a copy, so the ``_id`` added by PyMongo does
        # not leak back into the item handed to later pipelines.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.conn.close()
