A simple example of crawling the next page of cnblogs with the Scrapy framework
I made a mistake at first: I took the second-page URL straight out of the browser and string-concatenated it onto the base URL, so the spider never reached the next page.
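The reason the concatenation breaks: the pager links in the page source are relative paths (for the cnblogs front page, something like /sitehome/p/2 — the exact path here is an assumption), and response.urljoin resolves them against the current page's URL the same way urllib.parse.urljoin does:

from urllib.parse import urljoin

# A hypothetical relative pager href, resolved against the base URL
print(urljoin("http://www.cnblogs.com/", "/sitehome/p/2"))
# -> http://www.cnblogs.com/sitehome/p/2

Gluing a browser-copied second-page URL onto the base instead yields a malformed address, which is why the next page was never fetched.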
The correct approach is:
blog.py
import scrapy
from scrapy import Request
from bs4 import BeautifulSoup
import time

class BlogSpider(scrapy.Spider):
    name = "blog"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        ### Print the response to check that each next-page request succeeds
        print(response)
        soup = BeautifulSoup(response.text, 'lxml')
        art_lists = soup.find_all(class_="post-item")
        for article in art_lists:
            title = article.find(class_="post-item-title").text
            link = article.find(class_="post-item-title").attrs.get('href')
        # Crude throttle; note that time.sleep blocks Scrapy's event loop,
        # so DOWNLOAD_DELAY in settings.py is the idiomatic alternative
        time.sleep(3)
        page = soup.find(class_="pager")
        next_list = page.find_all('a')
        for i in next_list:
            next_page = i.attrs.get('href')
            # Skip the "/" link back to page 1; urljoin resolves the
            # relative pager hrefs against the current page's URL
            if next_page != '/':
                next_url = response.urljoin(next_page)
                yield Request(next_url)
        ### Print the last title/link on each page to check parsing works
        print('Title: %s; Link: %s' % (title, link))
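For comparison, the same parse logic can be written with Scrapy's own CSS selectors instead of BeautifulSoup. This is only a sketch reusing the class names from the spider above, not part of the original post:

    def parse(self, response):
        # Drop-in replacement for the parse() method above; same imports
        for article in response.css('.post-item'):
            title = article.css('.post-item-title::text').get()
            link = article.css('.post-item-title::attr(href)').get()
            print('Title: %s; Link: %s' % (title, link))
        # .pager a::attr(href) pulls every pager href; urljoin handles the
        # relative paths, and Scrapy's dupefilter drops repeated requests
        for href in response.css('.pager a::attr(href)').getall():
            if href != '/':
                yield Request(response.urljoin(href))

Letting the built-in selectors do the parsing avoids the extra BeautifulSoup/lxml dependency and the second pass over the HTML.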