1 """ 2 配置 redis 3 安装 pip3 install scrapy-redis 4 修改scrapy项目(先正常实现scrapy爬虫): 5 """ 6 7 # ----1 导入分布式爬虫类 8 from scrapy_ redis.spiders import RedisSpider, RedisCrawlSpider 9 # ----2 继承分布式爬虫类 10 class BookSpider(RedisSpider): 11 # ----3. 初始的 start_urls 改为 redis_key。 12 redis_key = 'key' 13 # ----4. settings中 修改调度器类和去重类 14 SCHEDULER = "scrapy_redis.scheduler.Scheduler" 15 DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 16 SCHEDULER_PERSIST = True 17 # ----5. 在settings.py文件中配置Redis 18 REDIS_URL = redis://:[password]@host:port 19 20 ----------------------------------------------------------------------- 21 # ----1 导入分布式爬虫类 22 from scrapy_ redis.spiders import RedisSpider, RedisCrawlSpider 23 # ----2 继承分布式爬虫类 24 class BookSpider(RedisSpider): 25 26 # ----3 注销start_url & allowed_domains 27 # ----4 设置redis_key 28 redis_key = 'key' 29 # ----5 设置__init__ 30 def __init__(self, *args, **kwargs): 31 domain = kwargs.pop('domain', '') 32 self.allowed_domains = list(filter(None, domain.split(','))) 33 super(BookSpider, self).__init__(*args, **kwargs) 34 # ----5 修改配置文件 35 SCHEDULER = "scrapy_redis.scheduler.Scheduler" 36 DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 37 SCHEDULER_PERSIST = True 38 REDIS_URL = redis://:[password]@host:port 39 40 # 注:运行时要加上domain指定的域
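The import above also brings in RedisCrawlSpider, the CrawlSpider counterpart: link-following rules work as usual, while start URLs still come from Redis. A hedged sketch of that variant (the spider name, key, and rule are illustrative assumptions, not from the original project):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class BookCrawlSpider(RedisCrawlSpider):
    name = 'book_crawl'                    # illustrative name
    redis_key = 'book_crawl:start_urls'    # seed this list as shown above
    rules = (
        # Follow every extracted link; narrow the LinkExtractor
        # (allow=..., restrict_css=...) for a real crawl.
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}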