Introduction
This article covers conventional paginated crawling, how to crawl paginated data quickly with the Scrapy framework, and cookie-based login. The example site is the 17k novel site, https://www.17k.com/
- Conventional paginated crawling
- Paginated crawling with Scrapy
- Cookie login
Pagination
Conventional paginated crawling
With conventional paginated crawling, you inspect the site to see how many pages of data there are and simply run a for loop that many times, building one URL per page.
import scrapy
from scrapy import Request


class Xiaoshuo17ptSpider(scrapy.Spider):
    name = "xiaoshuo17pt"
    allowed_domains = ["17k.com"]
    start_urls = ["https://www.17k.com/all/book/2_0_0_0_0_0_0_0_1.html"]

    def start_requests(self):
        # There are 5 pages in total, so request them one by one
        for i in range(1, 6):
            url = f'https://www.17k.com/all/book/2_0_0_0_0_0_0_0_{i}.html'
            yield Request(url, meta={"i": i})

    def parse(self, response, **kwargs):
        trs = response.xpath("//table/tbody/tr")
        for tr in trs:
            xiaoshuo_type = tr.xpath("./td[@class='td2']/a/text()").extract()
            if len(xiaoshuo_type) > 0:
                xiaoshuo_type = xiaoshuo_type[0]
            else:
                continue
            xiaoshuo_name = tr.xpath("./td[@class='td3']//a/text()").extract()
            if len(xiaoshuo_name) > 0:
                xiaoshuo_name = xiaoshuo_name[0]
            else:
                continue
            print(f"Novel category: {xiaoshuo_type}, novel title: {xiaoshuo_name}")
        print(f"==============> Finished crawling page {response.meta['i']}")
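Besides running it with the scrapy crawl xiaoshuo17pt command, the spider can also be started from a plain script. Below is a minimal sketch using CrawlerProcess, assuming the spider class above is importable from the same file:

from scrapy.crawler import CrawlerProcess

# Run the spider in-process; LOG_LEVEL is raised so the print output stays readable
process = CrawlerProcess(settings={"LOG_LEVEL": "WARNING"})
process.crawl(Xiaoshuo17ptSpider)
process.start()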
Paginated crawling with Scrapy
Instead of hard-coding the number of pages, extract the pagination links from each page and let Scrapy follow them; the scheduler deduplicates repeated URLs automatically.
import scrapy


class Xiaoshuo17scSpider(scrapy.Spider):
    name = "xiaoshuo17sc"
    allowed_domains = ["17k.com"]
    start_urls = ["https://www.17k.com/all/book/2_0_0_0_0_0_0_0_1.html"]

    def parse(self, response):
        # Collect every href inside the pagination bar
        fenye_urls = response.xpath('//div[@class="page"]//@href').extract()
        for fenye_url in fenye_urls:
            if fenye_url.startswith("javascript"):
                continue
            fenye_url = response.urljoin(fenye_url)
            # print(fenye_url)  # Many of these URLs are duplicates; when Request objects
            # are yielded, the Scrapy scheduler filters the duplicates out with a set
            yield scrapy.Request(fenye_url, callback=self.parse)
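The deduplication mentioned in the comment comes from Scrapy's request fingerprinting (the default RFPDupeFilter), which is why the same pagination URL yielded from several pages is only downloaded once. To watch it work, the standard DUPEFILTER_DEBUG setting makes the scheduler log every request it drops as a duplicate:

# settings.py
# Log every duplicate request the scheduler filters out, instead of only the first one
DUPEFILTER_DEBUG = True

# A single request can bypass the filter if ever needed:
# yield scrapy.Request(url, callback=self.parse, dont_filter=True)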
Cookie login
Copy and paste the cookie
Copy the Cookie string from the browser's developer tools, convert it into a dict, and pass it to the request via the cookies parameter.
# start_requests of the LoginSpider class
def start_requests(self):
    # Copied directly from the browser
    cookies = "GUID=bbb5f65a-2fa2-40a0-ac87-49840eae4ad1; c_channel=0; c_csc=web; Hm_lvt_9793f42b498361373512340937deb2a0=1627572532,1627711457,1627898858,1628144975; accessToken=avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com%252Fuser%252Favatar%252F16%252F16%252F64%252F75836416.jpg-88x88%253Fv%253D1610625030000%26id%3D75836416%26nickname%3D%25E5%25AD%25A4%25E9%25AD%2582%25E9%2587%258E%25E9%25AC%25BCsb%26e%3D1643697376%26s%3D73f8877e452e744c; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2275836416%22%2C%22%24device_id%22%3A%2217700ba9c71257-035a42ce449776-326d7006-2073600-17700ba9c728de%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%22bbb5f65a-2fa2-40a0-ac87-49840eae4ad1%22%7D; Hm_lpvt_9793f42b498361373512340937deb2a0=1628145672"
    # Convert the raw cookie string into a dict
    cookie_dic = {}
    for c in cookies.split("; "):
        k, v = c.split("=", 1)  # split on the first '=' only; a value may itself contain '='
        cookie_dic[k] = v
    yield Request(
        url=LoginSpider.start_urls[0],
        cookies=cookie_dic,
        callback=self.parse
    )
This approach is almost identical to what you would do with the requests library. Note that the cookie must be passed through the cookies parameter as a dict, not as a raw header string.
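To confirm that the pasted cookie actually logs the spider in, the parse callback can look for something that only appears for a logged-in account. Below is a minimal sketch; the check string (the account nickname, which appears URL-encoded inside the accessToken cookie above) is an assumption for illustration:

def parse(self, response):
    # Hypothetical check: the nickname only appears when the cookie is valid
    if "孤魂野鬼" in response.text:
        print("Logged in via the pasted cookie")
    else:
        print("Cookie login failed; the page looks anonymous")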
Log in directly to obtain the cookie
Instead of copying the cookie by hand, send the login request from the spider itself; Scrapy's cookie middleware keeps the session cookie from the login response and attaches it to subsequent requests automatically.
# These methods belong to the same LoginSpider class (imports: from scrapy import Request, FormRequest)
def start_requests(self):
    # Login flow
    username = "18614075987"
    password = "q6035945"
    url = "https://passport.17k.com/ck/user/login"
    # Option 1: send the POST request with a raw body
    # yield Request(
    #     url=url,
    #     method="post",
    #     body="loginName=18614075987&password=q6035945",
    #     callback=self.parse
    # )
    # Option 2: send the POST request as a form (recommended)
    yield FormRequest(
        url=url,
        formdata={
            "loginName": username,
            "password": password
        },
        callback=self.parse
    )

def parse(self, response):
    # The login response has arrived; now request the default start_urls
    # (the session cookie from the login response is sent automatically)
    yield Request(
        url=LoginSpider.start_urls[0],
        callback=self.parse_detail
    )

def parse_detail(self, resp):
    print(resp.text)
Note that there are two ways to send the POST request (compared in the sketch after this list):
- scrapy.Request(url=url, method='post', body=data)
- scrapy.FormRequest(url=url, formdata=data) -> recommended
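Here is a minimal sketch of the two variants side by side, using the same login endpoint as above. With a raw body you generally have to set the Content-Type header yourself, whereas FormRequest URL-encodes formdata and sets that header for you:

from scrapy import Request, FormRequest

login_url = "https://passport.17k.com/ck/user/login"

# Variant 1: raw body. The Content-Type header usually has to be supplied
# manually, or the server may not parse the form fields.
raw_post = Request(
    url=login_url,
    method="POST",
    body="loginName=18614075987&password=q6035945",
    headers={"Content-Type": "application/x-www-form-urlencoded"},
)

# Variant 2: FormRequest encodes the fields and sets the header for you.
form_post = FormRequest(
    url=login_url,
    formdata={"loginName": "18614075987", "password": "q6035945"},
)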
Provide the cookie in the settings file
The settings file has a configuration option, DEFAULT_REQUEST_HEADERS, where you can define default request headers, including a Cookie header. Note that you must also set COOKIES_ENABLED to False in settings; otherwise the built-in cookies downloader middleware will discard the Cookie header you set here.
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "Cookie": "xxxxxx",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
}
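With the cookie supplied globally like this, the spider code itself no longer touches cookies at all; every outgoing request simply carries the default headers. A minimal sketch under that assumption (spider name and start URL below are placeholders for illustration):

import scrapy


class SettingsCookieSpider(scrapy.Spider):
    # Hypothetical spider: no cookies are passed anywhere in code.
    # The Cookie header comes from DEFAULT_REQUEST_HEADERS because
    # COOKIES_ENABLED is False, which disables the cookies middleware.
    name = "settings_cookie_demo"
    allowed_domains = ["17k.com"]
    start_urls = ["https://www.17k.com/"]

    def parse(self, response):
        # Print a short slice of the page just to confirm the request went through
        print(response.text[:200])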