def parse(self, response):
    print('当当网')
    li = response.xpath('//ul[@id="component_59"]/li')
    # src, name and price share the same parent li. The first li has no
    # data-original attribute (its image is not lazy-loaded), so check
    # whether data-original is None while iterating and fall back to src.
    for item in li:
        srcFirst = item.xpath('./a/img/@src')
        src = item.xpath('./a/img/@data-original')
        name = item.xpath('./a/img/@alt')
        # extract the price text
        price = item.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
        if src.extract_first():
            resSrc = 'http:' + src.extract_first()
        else:
            resSrc = 'http:' + srcFirst.extract_first()
        resName = name.extract_first()
        resPrice = price.extract_first()
        print(resSrc, resName, resPrice)
        book = ScrapyproItem(src=resSrc, name=resName, price=resPrice)
        # hand the item over to the pipelines
        yield book
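As a side note, newer Scrapy releases recommend .get() over .extract_first(); since .get() returns None when the attribute is missing, the fallback above collapses into one line. A minimal sketch of the same extraction, assuming the same page structure:

        # inside the same for-loop: `or` falls back to @src for the
        # first, non-lazy-loaded image
        src = item.xpath('./a/img/@data-original').get() or item.xpath('./a/img/@src').get()
        resSrc = 'http:' + src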
settings.py
ITEM_PIPELINES = {
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
}
items.py
class ScrapyproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    src = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
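Scrapy items behave like dictionaries, which is exactly what the pipelines below rely on. A quick illustration, with made-up field values:

book = ScrapyproItem(src='//img3m0.ddimg.cn/x.jpg', name='some book', price='¥25.00')
print(book['name'])       # key access, like a dict
print(book.get('price'))  # .get() access, used in the image pipeline below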
pipelines.py
class ScrapyproPipeline:
    def process_item(self, item, spider):
        with open('book.json', 'a', encoding='utf-8') as fp:
            fp.write(str(item))
        return item
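Opening book.json in append mode for every single item works, but it reopens the file hundreds of times. Scrapy pipelines expose open_spider/close_spider hooks, so a common refinement is to open the file once per crawl ('w' here, so each run starts from a fresh file). A possible sketch:

class ScrapyproPipeline:
    def open_spider(self, spider):
        # called once when the spider starts
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.fp.close()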
Define a new pipeline to download the images:
import urllib.request
import os

class DangDownloadPicture:
    def process_item(self, item, spider):
        url = item.get('src')
        name = './books/' + item.get('name') + '.jpg'
        # make sure the target directory exists, or urlretrieve will fail
        os.makedirs('./books', exist_ok=True)
        urllib.request.urlretrieve(url=url, filename=name)
        return item
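Scrapy also ships a built-in ImagesPipeline that downloads through the Scrapy scheduler instead of blocking on urllib; it needs Pillow installed and an IMAGES_STORE = './books' entry in settings.py. A rough sketch of an equivalent pipeline, assuming a recent Scrapy version (2.4+ for the item keyword in file_path):

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class DangImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule the image URL as a normal Scrapy request
        yield scrapy.Request(item['src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # save as <IMAGES_STORE>/<book name>.jpg
        return item['name'] + '.jpg'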
settings.py (301 is the pipeline's priority; the smaller the number, the higher the priority)
ITEM_PIPELINES = {
    # keep the JSON pipeline registered as well, so both outputs are produced
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
    'scrapyPro.pipelines.DangDownloadPicture': 301,
}
Download the images and JSON data for all 100 pages:
class DangSpider(scrapy.Spider):
    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    # page 2 and later look like:
    # http://category.dangdang.com/pg2-cp01.01.02.00.00.00.html
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        print('当当网')
        li = response.xpath('//ul[@id="component_59"]/li')
        for item in li:
            srcFirst = item.xpath('./a/img/@src')
            src = item.xpath('./a/img/@data-original')
            name = item.xpath('./a/img/@alt')
            price = item.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
            if src.extract_first():
                resSrc = 'http:' + src.extract_first()
            else:
                resSrc = 'http:' + srcFirst.extract_first()
            resName = name.extract_first()
            resPrice = price.extract_first()
            print(resSrc, resName, resPrice)
            book = ScrapyproItem(src=resSrc, name=resName, price=resPrice)
            # hand the item over to the pipelines
            yield book
        # request the next page until page 100 is reached
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)
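Incidentally, the handwritten JSON pipeline writes str(item) fragments one after another, which is not a valid JSON document. Scrapy's feed exports can produce a well-formed file without any pipeline code; for example, run the spider as:

scrapy crawl dang -o books.json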