Novels: essential reading for homebodies, and one of the most common ways people kill time these days. Today's share is for everyone who loves reading novels — bookmark it!!
Target site:
https://www.zhenhunxiaoshuo.com
Crawl approach:
- Collect the book links under each of the site's categories
- Request each book link to get the book's chapter list
- Request each chapter's content and save it as a txt file
I've previously written a crawler for Biquge novels ~click to view~, so I won't analyze the crawl process in much detail here. The code is fairly simple — all data extraction is done with XPath, so the difficulty is low; if anything is unclear, refer to the Biquge novel crawler post. A stripped-down sketch of the flow follows.
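To make the three steps concrete, here is a minimal sketch that fetches one category page, follows the first book, and pulls the first chapter's text. The XPath selectors are the same ones used in the full script below; everything else (thread pool, saving, error handling) is stripped out for illustration, so treat it as a sketch of the flow rather than a hardened crawler:

import requests
from lxml import etree

root = 'https://www.zhenhunxiaoshuo.com'
headers = {'user-agent': 'Mozilla/5.0'}

# Step 1: the category page lists books as rows of its first table (hrefs are relative)
tree = etree.HTML(requests.get(root + '/chunai', headers=headers).text)
book_href = tree.xpath('//article[@class="article-content"]/table[1]//tr/td/a/@href')[0]

# Step 2: the book page lists chapters under div.excerpts (hrefs are absolute)
tree = etree.HTML(requests.get(root + book_href, headers=headers).text)
chapter_url = tree.xpath('//div[@class="excerpts"]/article/a/@href')[0]

# Step 3: the chapter page holds the text inside article.article-content
tree = etree.HTML(requests.get(chapter_url, headers=headers).text)
print(''.join(tree.xpath('//article[@class="article-content"]//text()'))[:200])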
Right-click and run the code, and it will download the whole site's novels — give it a try. That said, once it has been running for a bit I'd suggest stopping the program: read the code and learn the approach instead. The site is small and I worry the webmaster's server can't take the load, so please go easy on it (see the throttling sketch after the listing). To get the code, reply: “多进程小说爬取”.
The sample code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import os
import requests
import traceback
from lxml import etree
from multiprocessing.dummy import Pool  # a thread pool with the multiprocessing.Pool API
class ZhenHun(object):
    def __init__(self):
        self._root_url = 'https://www.zhenhunxiaoshuo.com'
        self._headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
        }
        self._chunai_path = './纯爱小说类'
        self._yanqing_path = './言情小说类'
        self._priest_path = './priest小说集'
        # Set per category before crawling, so novels land in the right folder
        self._save_path = self._chunai_path
    def get_pagination(self, url: str) -> int:
        '''
        Get how many pages the current category has.
        :param url: category index URL
        :return: the page count, or 0 if there is only one page
        '''
        response = requests.get(url=url, headers=self._headers)
        page = response.text
        response.close()
        tree = etree.HTML(page)
        try:
            # The pager lives in the second table; two of its <th> cells are
            # not page links, hence the -2
            th_list = tree.xpath('//div[@class="content"]/article/table[2]//tr/th')
            if th_list:
                return len(th_list) - 2
            return 0
        except Exception:
            return 0
    def parse_outline(self, url: str) -> list:
        '''
        Parse the <tr> rows of a category outline page.
        :param url: page URL
        :return: list of <tr> elements, tr_list
        '''
        response = requests.get(url=url, headers=self._headers)
        page = response.text
        response.close()
        tree = etree.HTML(page)
        tr_list = tree.xpath('//article[@class="article-content"]/table[1]//tr')
        # Drop the header row (slicing is safe even if the list is empty)
        return tr_list[1:]
    def get_novel_data(self, tr: etree._Element) -> tuple:
        '''
        Extract a novel's URL, name and author from a <tr> row.
        :param tr: the <tr> element
        :return: (path_novel, novel_name, novel_url), or None on failure
        '''
        try:
            novel_name = tr.xpath('./td/a/strong/text()')[0]
            novel_author = tr.xpath('./td[2]/text()')[0]
            novel_url = self._root_url + tr.xpath('./td/a/@href')[0]
            # Use the current category's folder (the original hard-coded the
            # 纯爱 folder here, dumping every category into one place);
            # makedirs with exist_ok is also safe when threads race
            path_novel = f'{self._save_path}/{novel_name}---{novel_author}'
            os.makedirs(path_novel, exist_ok=True)
            return path_novel, novel_name, novel_url
        except Exception:
            return None
    def get_section_data(self, novel_data: tuple) -> None:
        '''
        Fetch every chapter of the novel described by novel_data and save it.
        :param novel_data: (novel_path, novel_name, novel_url)
        :return: None
        '''
        if not novel_data:
            # get_novel_data returns None when a row failed to parse
            return
        novel_path, novel_name, novel_url = novel_data
        response = requests.get(url=novel_url, headers=self._headers)
        page = response.text
        response.close()
        tree = etree.HTML(page)
        article_list = tree.xpath('//div[@class="excerpts"]/article')
        for article in article_list:
            section_name = article.xpath('./a/text()')[0]
            section_url = article.xpath('./a/@href')[0]
            # Strip characters that are awkward in file names
            section_name = re.sub('[¥#%*&@]', '', section_name)
            path = novel_path + f'/{section_name}.txt'
            name = novel_name + f'---{section_name}'
            response = requests.get(url=section_url, headers=self._headers)
            page = response.text
            response.close()
            tree = etree.HTML(page)
            content = ''.join(tree.xpath('//article[@class="article-content"]//text()'))
            # Save
            self.save_data(path=path, name=name, content=content)
    def save_data(self, path: str, name: str, content: str) -> None:
        '''
        Save a chapter to disk.
        :param path: file path to save to
        :param name: label used in the progress message
        :param content: the text to save
        :return: None
        '''
        if not os.path.exists(path):
            with open(file=path, mode='w', encoding='utf-8') as f:
                f.write(content)
            print(f'Downloaded---{name}')
        else:
            print(f'Already saved---{name}')
    def spider(self, url: str) -> None:
        '''
        Crawl one outline page.
        :param url: page URL
        :return: None
        '''
        # 15 worker threads; the work is I/O-bound, so threads suffice
        pool = Pool(15)
        # Parse the <tr> row of every novel on this outline page
        tr_list = self.parse_outline(url=url)
        # From each row, extract the novel's URL, name and author
        novel_data_list = pool.map(self.get_novel_data, tr_list)
        # Fetch and save every novel's content
        pool.map(self.get_section_data, novel_data_list)
        pool.close()
        pool.join()
    def crawl_category(self, label: str, save_path: str, slug: str) -> None:
        '''
        Crawl one category; page 1 lives at /<slug>, page N at /<slug>N.
        :param label: display name used in progress messages
        :param save_path: folder the category's novels are saved into
        :param slug: URL path segment of the category
        :return: None
        '''
        os.makedirs(save_path, exist_ok=True)
        # Route get_novel_data's output into this category's folder
        self._save_path = save_path
        url_outline = self._root_url + '/' + slug
        # Get the page count (0 means a single page)
        pagination = self.get_pagination(url=url_outline)
        print(f'{label}---{max(pagination, 1)} page(s) in total')
        # Crawl the first page
        self.spider(url=url_outline)
        # Crawl the remaining pages, if any
        for i in range(2, pagination + 1):
            self.spider(url=self._root_url + f'/{slug}{i}')

    def chunai(self):
        self.crawl_category('纯爱类小说', self._chunai_path, 'chunai')

    def yanqing(self):
        self.crawl_category('言情类小说', self._yanqing_path, 'yanqing')

    def priest(self):
        self.crawl_category('priest小说集', self._priest_path, 'priest')
if __name__ == '__main__':
    try:
        zh = ZhenHun()
        zh.chunai()
        zh.yanqing()
        zh.priest()
        input('All novels downloaded, press Enter to exit')
    except Exception:
        print(traceback.format_exc())
        input('An exception occurred, please save the traceback and report it\nPress Enter to exit')
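As noted above, the site is small, so if you do let the crawler run for a while, a short random pause between chapter requests keeps the load gentle. A minimal sketch of the idea — the delay range is an arbitrary choice of mine, and the line would go inside get_section_data's chapter loop, right before each chapter request:

import time
import random

# inside get_section_data(), at the top of the chapter loop:
time.sleep(random.uniform(0.5, 1.5))  # arbitrary 0.5–1.5s pause between chapter requests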