# Tags: title, url, resp, crawling, headers, regex, example, print, data
# From: https://www.cnblogs.com/longwanghzx/p/16601958.html
import os
import re

import requests
# Index page that lists every chapter of the novel.
# NOTE(review): the path contains a literal space ("1183478 1.html"); this
# looks like a mangled separator -- confirm the intended index-page URL.
url = 'https://b.faloo.com/1183478 1.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Directory the per-chapter text files are written to.
OUTPUT_DIR = './novel'


def extract_chapter_links(html):
    """Return the ``href`` value of every chapter anchor found in *html*.

    The lazy group is terminated by the closing quote of the attribute;
    without that terminator ``(.*?)`` always matches the empty string.
    """
    return re.findall(r'<a class="c_con_li_datail" href="(.*?)"', html)


def parse_chapter(html):
    """Extract ``(title, text)`` from a chapter page.

    Either element is ``None`` when the corresponding markup is missing.
    In the text, ``<p>`` becomes a newline and ``</p>`` is dropped.
    """
    title_match = re.search(r'<h1>(.*?)</h1>', html)
    body_match = re.search(
        r'<div class="noveContent">([\s\S]*?)<b><font color=red>', html
    )
    title = title_match.group(1) if title_match else None
    text = None
    if body_match:
        text = body_match.group(1).replace('<p>', '\n').replace('</p>', '')
    return title, text


def safe_filename(title):
    """Replace characters that common file systems reject in file names."""
    return re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip()


def main():
    """Download every chapter linked from ``url`` into ``OUTPUT_DIR``."""
    # open() does not create missing parent directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Match the link of every chapter.
    for data_url in extract_chapter_links(res.text):
        # The scheme is prepended to whatever the href contains.
        new_url = 'https:' + data_url
        # Request the full content of a single chapter.
        resp = requests.get(new_url, headers=headers, timeout=REQUEST_TIMEOUT)
        title, data = parse_chapter(resp.text)
        if title is None or data is None:
            # Page layout did not match; skip it rather than crash the run.
            print('跳过--{}--'.format(new_url))
            continue
        path = os.path.join(OUTPUT_DIR, '{}.txt'.format(safe_filename(title)))
        with open(path, 'w', encoding='utf-8') as f1:
            f1.write(data)
        print('正在写入--{}--'.format(title))


if __name__ == '__main__':
    main()