Without further ado, here's the main spider code:

```
import scrapy
from selenium.webdriver import Chrome
from CodeNav.items import CodenavItem


class CodeSpider(scrapy.Spider):
    name = 'code'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://music.163.com/#/playlist?id=2329680016']
    # One shared browser instance; the downloader middleware reaches it
    # through spider.bro so the page is rendered before parsing.
    bro = Chrome()

    def parse(self, response, **kwargs):
        # response is the HtmlResponse the middleware built from the
        # rendered page source, so the dynamic comment list is present
        div_list = response.xpath('/html/body/div[3]/div[1]/div/div/div[3]/div/div[2]/div[2]/div')
        for div in div_list:
            title = div.xpath('./div[2]/div[1]/div/a/text()').extract_first()
            cnt = div.xpath('./div[2]/div[1]/div/text()').extract_first()
            item = CodenavItem()
            item['title'] = title
            item['cnt'] = cnt
            yield item

    def close(self, spider):
        # Quit the shared browser when the spider finishes
        self.bro.quit()
```
Now let's look at the downloader middleware:

```
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy.http import HtmlResponse
from time import sleep


class CodenavDownloaderMiddleware:

    def process_request(self, request, spider):
        # Reuse the browser instance created on the spider
        web = spider.bro
        web.get(request.url)
        sleep(2)
        # Scroll to the bottom so lazily loaded content is rendered
        web.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        sleep(2)
        # The comments are nested in an iframe; the argument is the iframe's id
        web.switch_to.frame('g_iframe')
        sleep(2)
        page_source = web.page_source
        # Hand the rendered HTML back to Scrapy in place of a normal download
        return HtmlResponse(url=request.url, body=page_source, encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either return a Response object, return a Request object,
        # or raise IgnoreRequest.
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either return None (continue processing this exception),
        # or return a Response or Request object (stops the chain).
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
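One piece of wiring that's easy to forget: the middleware only runs if it's enabled in the project's settings.py. A minimal sketch, assuming the default CodeNav project layout (543 is just the priority from Scrapy's project template):

```
# settings.py -- enable the downloader middleware; if you persist the
# items, enable your item pipeline here as well
DOWNLOADER_MIDDLEWARES = {
    'CodeNav.middlewares.CodenavDownloaderMiddleware': 543,
}
# Commonly turned off in demos like this so the request isn't filtered out
ROBOTSTXT_OBEY = False
```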
***Note***: NetEase Cloud Music nests the comments inside an iframe, so you have to `switch_to` the corresponding frame first (the argument in the parentheses is the iframe's id). Once that's done, you can print the page source to check it.
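To confirm the switch actually worked, a quick standalone check like the one below (a sketch, separate from the project code) prints the iframe's HTML; `switch_to.default_content()` returns to the top-level document:

```
from time import sleep
from selenium.webdriver import Chrome

web = Chrome()
web.get('https://music.163.com/#/playlist?id=2329680016')
sleep(2)
web.switch_to.frame('g_iframe')    # enter the iframe by its id
print(web.page_source[:500])       # now shows the comment markup
web.switch_to.default_content()    # back to the top-level document
web.quit()
```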
The rest is standard Scrapy usage. Since my goal here was only to practice using Selenium inside Scrapy, I just crawled the first page of comments; fetching later pages requires some modifications, sketched below.
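For reference, one way to extend this to later pages is to request the same URL once per page and have the middleware click the pager's "next" button before grabbing the source. This is only a sketch: the `a.znxt` selector for the next button is my assumption about NetEase's markup and should be verified in the browser's dev tools.

```
import scrapy
from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By
from time import sleep

# In the spider: one request per comment page; dont_filter=True stops
# Scrapy's duplicate filter from dropping the repeated URL.
def start_requests(self):
    for page in range(1, 4):                        # e.g. first three pages
        yield scrapy.Request(self.start_urls[0], dont_filter=True,
                             meta={'page': page})

# In the middleware: click "next" (page - 1) times before taking the
# snapshot. 'a.znxt' is an assumed selector for the next-page button.
def process_request(self, request, spider):
    web = spider.bro
    web.get(request.url)
    sleep(2)
    web.switch_to.frame('g_iframe')
    sleep(2)
    for _ in range(request.meta.get('page', 1) - 1):
        web.find_element(By.CSS_SELECTOR, 'a.znxt').click()
        sleep(2)
    web.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(2)
    return HtmlResponse(url=request.url, body=web.page_source,
                        encoding='utf-8', request=request)
```

Re-rendering the page from scratch for every request is slow but keeps the spider's existing `parse` method unchanged, since each page still arrives as its own response.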