1、导入模块
import requests
from lxml import etree
2、获取小说名字以及章节地址(b_url 和 headers 的定义见第 4 节完整代码)
# Fetch the book's index page and extract the title and chapter URLs.
# b_url and headers must already be defined before this snippet runs.
# The timeout keeps the script from blocking forever on a stalled connection.
b_resp = requests.get(b_url, headers=headers, timeout=10)
# Fail fast on an HTTP error status instead of parsing an error page.
b_resp.raise_for_status()
# Decode the response body as gb18030 when .text is read.
b_resp.encoding = 'gb18030'
# Parse the HTML page.
html = etree.HTML(b_resp.text)
# Book title: the first text node matched by //h1/span.
b_title = html.xpath('//h1/span/text()')[0]
# Chapter URLs: every matched href except the first nine
# (presumably non-chapter links -- verify against the live page).
b_list = html.xpath('//tr/td//div/a/@href')[9:]
3、获取小说内容
# Download every chapter URL in b_list and append its text to "<b_title>.txt".
# Relies on b_list, b_title and headers being defined earlier.
for item in b_list:
    # The timeout keeps one stalled chapter request from hanging the whole run.
    c_resp = requests.get(item, headers=headers, timeout=10)
    # Fail fast on an HTTP error status instead of parsing an error page.
    c_resp.raise_for_status()
    c_resp.encoding = 'gb18030'
    c_page = etree.HTML(c_resp.text)
    # Chapter title: the first text node matched by //h2.
    c_title = c_page.xpath('//h2/text()')[0]
    # Chapter body: text nodes of the "noveltext" div, dropping the first
    # three and last five (presumably page boilerplate -- verify).
    c_list = c_page.xpath('//div[@class = "noveltext"]/text()')[3:-5]
    # Join the text nodes with newlines, then strip spaces, full-width
    # spaces (U+3000) and carriage returns.
    c_list = '\n'.join(c_list).replace(' ', '').replace('\u3000', '').replace('\r', '')
    # Combine the chapter title and body.
    c_content = f'\n{c_title}\n{c_list}\n\n'
    # Append per chapter so chapters already fetched stay on disk if a later
    # request fails. Append mode also means a re-run adds to an existing file.
    with open(f"{b_title}.txt", mode='a', encoding='utf-8') as f:
        f.write(c_content)
    print(f'{c_title}下载完成')
print(f"{b_title}下载完成")
4、完整代码
import requests
from lxml import etree
import re
# Complete script: download one novel from jjwxc.net into "<title>.txt".
b_url = 'https://www.jjwxc.net/onebook.php?novelid=3472688'
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# Fetch the book's index page. The timeout keeps the script from blocking
# forever on a stalled connection.
b_resp = requests.get(b_url, headers=headers, timeout=10)
# Fail fast on an HTTP error status instead of parsing an error page.
b_resp.raise_for_status()
# Decode the response body as gb18030 when .text is read.
b_resp.encoding = 'gb18030'
# Parse the HTML page.
html = etree.HTML(b_resp.text)
# Book title: the first text node matched by //h1/span.
b_title = html.xpath('//h1/span/text()')[0]
# Chapter URLs: every matched href except the first nine
# (presumably non-chapter links -- verify against the live page).
b_list = html.xpath('//tr/td//div/a/@href')[9:]
print(b_title)
print(b_list)

# Download every chapter and append its text to "<b_title>.txt".
for item in b_list:
    c_resp = requests.get(item, headers=headers, timeout=10)
    c_resp.raise_for_status()
    c_resp.encoding = 'gb18030'
    c_page = etree.HTML(c_resp.text)
    # Chapter title: the first text node matched by //h2.
    c_title = c_page.xpath('//h2/text()')[0]
    # Chapter body: text nodes of the "noveltext" div, dropping the first
    # three and last five (presumably page boilerplate -- verify).
    c_list = c_page.xpath('//div[@class = "noveltext"]/text()')[3:-5]
    # Join the text nodes with newlines, then strip spaces, full-width
    # spaces (U+3000) and carriage returns.
    c_list = '\n'.join(c_list).replace(' ', '').replace('\u3000', '').replace('\r', '')
    # Combine the chapter title and body.
    c_content = f'\n{c_title}\n{c_list}\n\n'
    # Append per chapter so chapters already fetched stay on disk if a later
    # request fails. Append mode also means a re-run adds to an existing file.
    with open(f"{b_title}.txt", mode='a', encoding='utf-8') as f:
        f.write(c_content)
    print(f'{c_title}下载完成')
print(f"{b_title}下载完成")