代码仅供学习使用,请勿用于其他用途。
一、效果图
二、代码编写
1、items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class Scrapy77DianshiItem(scrapy.Item):
    """Container for one movie entry scraped from 77dianshi.com.

    All fields hold plain strings extracted by MovieSpider; any field may
    be absent if the corresponding node was missing on the listing page.
    """
    # movie title
    title = scrapy.Field()
    # cover image URL
    pic = scrapy.Field()
    # short description text
    desc = scrapy.Field()
    # genre / remarks label
    remarks = scrapy.Field()
    # absolute URL of the detail page
    link = scrapy.Field()
2、movie.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import Scrapy77DianshiItem


class MovieSpider(scrapy.Spider):
    """Crawl the movie listing of 77dianshi.com page by page.

    Starts at page 1 and follows the pagination link until the
    "next page" («»») arrow is no longer present.
    """
    name = 'movie'
    allowed_domains = ['77dianshi.com']
    # current page counter, advanced in parse()
    page = 1
    host = "http://77dianshi.com"
    url = host + "/iTe5kdy/page_{0}.html"
    start_urls = [url.format(str(page))]

    def parse(self, response):
        """Yield one item per listed movie, then request the next page.

        Uses Selector.get() instead of .extract()[0]: a missing node
        yields None for that field rather than raising IndexError and
        aborting the whole page.
        """
        print("当前采集第{0}页".format(self.page))
        # Iterate the movie cards in the listing.
        for each in response.xpath("//ul[@class='fed-list-info fed-part-rows']//li"):
            item = Scrapy77DianshiItem()
            item['title'] = each.xpath('./a[2]//text()').get()
            # Guard the concatenation: .get() may return None.
            href = each.xpath('./a[1]/@href').get()
            item['link'] = self.host + href if href else None
            item['desc'] = each.xpath('./span[1]//text()').get()
            item['remarks'] = each.xpath('./a[1]//span[3]//text()').get()
            item['pic'] = each.xpath('./a[1]/@data-original').get()
            yield item
        # The pager's last anchor is "»" on every page except the final one.
        last_page = response.xpath("//div[@class='pages text-center']//a[last()]//text()").get()
        if last_page == "»":
            # Not the last page: queue the next one through the same callback.
            self.page += 1
            yield scrapy.Request(self.url.format(self.page), callback=self.parse)
        else:
            print('结束采集,最后一页:' + str(self.page))
3、pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class Scrapy77DianshiPipeline(object):
    """Write each scraped item to movie.json as one JSON object per line."""

    def __init__(self):
        # Explicit UTF-8 is required: ensure_ascii=False below emits raw
        # Chinese characters, which would raise UnicodeEncodeError on
        # platforms whose default file encoding is not UTF-8.
        self.file = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass it on unchanged."""
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        """Release the output file when the spider finishes."""
        self.file.close()
4、start.py启动文件
import os

from scrapy import cmdline

if __name__ == "__main__":
    # Run the crawler from the script's own directory so that Scrapy
    # finds scrapy.cfg regardless of where the process was launched.
    project_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(project_dir)
    # Launch the 'movie' spider exactly as `scrapy crawl movie` would.
    cmdline.execute(["scrapy", "crawl", "movie"])
标签:__,视频,xpath,案例,self,scrapy,item,Scrapy,page From: https://www.cnblogs.com/yang-2018/p/16774382.html