- Incremental crawling
- Concept: used to monitor a website for newly updated data, so that only entries not crawled before are fetched.
- Core mechanism: deduplication, implemented with a Redis set (a minimal sketch follows the list below).
- Summary of anti-crawling mechanisms:
- robots.txt
- UA spoofing
- CAPTCHAs
- proxies
- cookies
- dynamically changing request parameters
- JS encryption
- JS obfuscation
- image lazy loading
- capturing dynamically loaded data
- selenium: evading detection
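Below is a minimal sketch of the dedup idea, assuming a local Redis server on the default port; the helper function is_new is hypothetical, but the key name movie_detail_urls matches the spider further down. sadd only inserts a member that is not yet in the set, and its return value tells you whether the URL is new.

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

def is_new(url):
    # sadd returns 1 if the member was newly added to the set (never seen before)
    # and 0 if it already exists, i.e. the URL was crawled on a previous run
    return conn.sadd('movie_detail_urls', url) == 1

print(is_new('https://www.example.com/detail/1.html'))  # True on the first run
print(is_new('https://www.example.com/detail/1.html'))  # False on every later run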
1、Create the project: scrapy startproject <project_name> (here: zjsPro1)
2、Create the CrawlSpider: scrapy genspider -t crawl zjs www.xxx.com
3、In the settings file, configure UA spoofing, the log level, and robots.txt handling (see the settings.py sketch after this list)
4、In the spider, parse the movie name, the detail-page URL, and the description
5、Declare the fields in items.py (see the items.py sketch after this list)
6、Import the item class into the spider
7、Persist the scraped data in the pipeline (see the pipelines.py sketch after this list)
8、Enable the pipeline in the settings file
9、Run the spider: scrapy crawl zjs
10、Start the Redis client: redis-cli.exe
    list all keys: keys *
    get the length of the stored list: llen moiveData
    view the dedup set: smembers movie_detail_urls
    clear everything: flushall
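A settings.py sketch covering steps 3 and 8. The user-agent string below is just an example, and the pipeline class name assumes Scrapy's default naming for the zjsPro1 project; adjust both to your own setup.

settings.py (excerpt)
# Step 3: UA spoofing, robots.txt, log level
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0 Safari/537.36'
ROBOTSTXT_OBEY = False   # ignore robots.txt
LOG_LEVEL = 'ERROR'      # only print error-level logs

# Step 8: enable the pipeline
ITEM_PIPELINES = {
    'zjsPro1.pipelines.Zjspro1Pipeline': 300,
}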
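An items.py sketch for step 5, declaring the two fields the spider fills in (Zjspro1Item is the class imported in zjs.py below).

items.py
import scrapy

class Zjspro1Item(scrapy.Item):
    name = scrapy.Field()   # movie title
    desc = scrapy.Field()   # movie description from the detail page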
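A pipelines.py sketch for step 7. The original pipeline is not shown, so this is just one plausible implementation: it reuses the spider's Redis connection and pushes each item onto a Redis list whose key matches the llen command in step 10.

pipelines.py
import json

class Zjspro1Pipeline:
    def open_spider(self, spider):
        # reuse the Redis connection created on the spider class
        self.conn = spider.conn

    def process_item(self, item, spider):
        # push the serialized item onto a Redis list;
        # 'moiveData' is the key queried with `llen moiveData` in step 10
        self.conn.lpush('moiveData', json.dumps(dict(item), ensure_ascii=False))
        return item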
zjs.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zjsPro1.items import Zjspro1Item


class ZjsSpider(CrawlSpider):
    # Redis connection shared by all callbacks, used for URL deduplication
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'zjs'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/%E7%88%B1%E6%83%85/id/1.html']
    rules = (
        Rule(LinkExtractor(allow=r'/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # parse the movie name and the detail-page URL
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567kan.com/' + li.xpath('./div/a/@href').extract_first()
            item = Zjspro1Item()
            item['name'] = name
            # record the detail-page URL of every movie that has been crawled;
            # sadd returns 0 if the URL is already in the set (insert failed)
            # and 1 if it was newly inserted (insert succeeded)
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('Captured newly updated data!')
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data to update!')

    def parse_detail(self, response):
        # parse the description
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = desc
        yield item