1) Using regular expressions
# Grab one e-book of your choice from https://www.kanunu8.com/book3/ with the requests library and regular expressions
import requests
import re
import os

header = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
def main(base_url, book_name):
    os.makedirs(book_name, exist_ok=True)
    # Request the index page
    req = requests.get(base_url, headers=header)
    content = req.content.decode(req.apparent_encoding)
    # Keep only the chapter table between the "正文" marker and </tbody>
    content = re.findall('正文(.*?)</tbody>', content, re.S)
    chap_url_list, chap_name_list = get_condition(content)
    x = 0
    for i in range(len(chap_name_list)):
        chap_name = chap_name_list[i]
        chap_url = chap_url_list[i]
        chap_txt = get_txt(base_url, chap_url)
        save(book_name, chap_name, chap_txt)
        if chap_name == '前言':  # the preface chapter keeps its own title
            print("Preface saved")
        else:
            print(f"Chapter {x} saved")
        x = x + 1
# Get the chapter URL list and the chapter title list
def get_condition(content):
    for item in content:
        chap_url_list = re.findall('[0-9]{6}.html', item)
    for item in content:
        # the 1831xx prefix is specific to this book's chapter pages
        chap_name_list = re.findall('<a href="1831[0-9][0-9].html">(.+)</a></td>', item)
    return chap_url_list, chap_name_list
# Get the content of one chapter
def get_txt(base_url, chap_url):
    # Reduce the index URL to the book directory, e.g. https://www.kanunu8.com/book3/8259/
    base_url = re.search('https://www.kanunu8.com/book3/[0-9]{4}/', base_url).group(0)
    url = base_url + chap_url
    # Request the chapter page
    req = requests.get(url, headers=header)
    chap_txt = req.content.decode('gbk')
    # Pick the book text out of the page source
    chap_txt = re.findall(r'<p>(.*?)</p>', chap_txt, re.S)[0]
    # Data cleaning: strip &nbsp; and <br /> left over from the HTML
    chap_txt = chap_txt.replace('&nbsp;', "")
    chap_txt = chap_txt.replace('<br />', "")
    return chap_txt
# Save into a folder under the current directory
def save(book_name, chap_name, chap_txt):
    chap_name = chap_name + '.txt'
    with open(os.path.join(book_name, chap_name), 'w', encoding='gbk') as file:
        file.write(chap_txt)

if __name__ == '__main__':
    base_url = "https://www.kanunu8.com/book3/8259/index.html"
    book_name = "孽海花"
    main(base_url, book_name)
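To see what the two regex passes actually return before pointing them at the live site, here is a minimal sketch that runs the same patterns against a small hand-written HTML fragment; the fragment and the 1831xx chapter numbers are made up for illustration:

import re

# Made-up fragment shaped like the kanunu8 chapter index (illustrative only)
sample = '''正文
<tbody>
<tr><td><a href="183101.html">前言</a></td></tr>
<tr><td><a href="183102.html">第一章</a></td></tr>
</tbody>'''

# Pass 1: keep only the block between "正文" and </tbody>
block = re.findall('正文(.*?)</tbody>', sample, re.S)[0]
# Pass 2: pull the chapter URLs and titles out of that block
print(re.findall('[0-9]{6}.html', block))                                # ['183101.html', '183102.html']
print(re.findall('<a href="1831[0-9][0-9].html">(.+)</a></td>', block))  # ['前言', '第一章']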
2) Using bs4
# Grab an e-book from https://www.kanunu8.com/book2 with the requests and beautifulsoup4 libraries
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import cchardet
import os
import re

# Use a random request header
header = {
    'user-agent': UserAgent().random
}

# Fetch the page source
def getSource(url):
    req = requests.get(url, headers=header)
    req.encoding = cchardet.detect(req.content)['encoding']
    return req.text.replace("<br />", "")
# Parse the index page
def getUrl(source):
    bs = BeautifulSoup(source, 'lxml')
    # Get the author
    actor = bs.select('tr>td[align="center"][height="30"][valign="middle"]')[0].string
    actor = re.search('作者:(.*?) ', str(actor).strip()).group(1)
    # Get the book title
    book_name = bs.select('h1>strong>font[color="#dc143c"]')[0].string
    # Get the synopsis
    introduction = bs.select('tr[align="left"]>td[class="p10-24"]')[0].strings
    introduction = ''.join([i.replace("内容简介:", "").strip() for i in introduction])
    print("Title:", book_name, "\nAuthor:", actor, "\nSynopsis:", introduction)
    # Get each chapter's link and name
    a = bs.select('tr[bgcolor="#ffffff"]>td>a[href]')
    for i in a:
        chap_name = i.string
        chap_url = i['href']
        chap_txt = getContent(chap_url)
        save(book_name, chap_name, chap_txt)
        print(f'{chap_name} saved')
# Get the chapter content behind each chapter link
def getContent(chap_url):
    url = base_url + chap_url  # base_url is the module-level variable set under __main__
    print(url)
    source = getSource(url)
    bs = BeautifulSoup(source, 'lxml')
    # bs4 decodes &nbsp; to \xa0, so strip the non-breaking spaces here
    content = str(bs.select('p')[0].string).replace('\xa0', "")
    return content
# Save to disk
def save(book_name, chap_name, chap_txt):
    chap_name = chap_name + '.txt'
    # Create the book folder if it does not exist yet
    if not os.path.exists(book_name):
        os.mkdir(book_name)
    with open(os.path.join(book_name, chap_name), 'w+', encoding='utf-8') as f:
        f.write(chap_txt)

if __name__ == '__main__':
    base_url = "https://www.kanunu8.com/book3/8196/"
    source = getSource(base_url)
    getUrl(source)
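The bs4 version relies on CSS attribute selectors instead of regular expressions. The following minimal sketch shows how select() handles such a selector; the HTML fragment is made up and only loosely shaped like the real index table:

from bs4 import BeautifulSoup

# Made-up fragment loosely shaped like the kanunu8 index table (illustrative only)
sample = '''<table>
<tr bgcolor="#ffffff"><td><a href="188101.html">第一章</a></td></tr>
<tr bgcolor="#ffffff"><td><a href="188102.html">第二章</a></td></tr>
<tr bgcolor="#cccccc"><td>decoration row, no link</td></tr>
</table>'''

bs = BeautifulSoup(sample, 'lxml')
# The selector keeps only <a href> tags that are direct children of a <td> inside a white row
for a in bs.select('tr[bgcolor="#ffffff"]>td>a[href]'):
    print(a.string, a['href'])   # 第一章 188101.html / 第二章 188102.html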