#bs4解析实战
import requests
from bs4 import BeautifulSoup
import re
# Contents page of the book; every chapter is linked from here.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
# Browser-like User-Agent header sent with every request.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# File that receives every chapter as one "title:content" entry.
OUTPUT_PATH = 'F:/spidertest/sanguo.txt'
# Seconds to wait on each HTTP request so a stalled server cannot hang the run.
REQUEST_TIMEOUT = 10


def chapter_url(href: str) -> str:
    """Return the absolute URL for a chapter link found on the contents page.

    Args:
        href: The ``href`` attribute of a chapter link; may be site-relative
            (``/book/...``), page-relative, or already absolute.

    Returns:
        The absolute URL, resolved against the contents-page ``url``.
    """
    from urllib.parse import urljoin

    # urljoin handles relative and absolute hrefs alike, unlike plain string
    # concatenation with the site root.
    return urljoin(url, href)


def format_entry(title: str, content: str) -> str:
    """Return the text written to the output file for one chapter."""
    return title + ':' + content + '\n'


def fetch_soup(page_url: str) -> 'BeautifulSoup':
    """Download *page_url* and return it parsed by BeautifulSoup (lxml parser).

    Args:
        page_url: Absolute URL of the page to download.

    Returns:
        The parsed document.

    Raises:
        requests.RequestException: On a timeout, a connection error, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url=page_url, headers=head, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Parse the raw bytes instead of response.text: when the Content-Type
    # header carries no charset, requests decodes text as ISO-8859-1, which
    # garbles Chinese pages. BeautifulSoup detects the encoding from the
    # document itself.
    return BeautifulSoup(response.content, 'lxml')


def main() -> None:
    """Scrape every chapter linked from the contents page into OUTPUT_PATH.

    Each chapter is appended to the file as ``title:content`` followed by a
    newline, and a status line is printed per chapter.

    Raises:
        requests.RequestException: If any page cannot be downloaded.
        OSError: If the output file cannot be opened or written.
    """
    # Parsing does not change the content; it just lets us look tags up directly.
    chapter_items = fetch_soup(url).select('.book-mulu>ul>li')
    # The with-block closes the file even if a request fails part-way through.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as fp:
        for li in chapter_items:
            link = li.a
            if link is None or not link.get('href'):
                # List entry without a usable chapter link: nothing to fetch.
                continue
            # get_text() also works when the link wraps nested tags, where
            # .string would be None.
            title = link.get_text()
            body = fetch_soup(chapter_url(link['href'])).find(
                'div', class_='chapter_content'
            )
            if body is None:
                # Page layout differs from what we expect; report and move on
                # instead of crashing on None.
                print(title, '爬取失败')
                continue
            fp.write(format_entry(title, body.text))
            print(title, '爬取成功')


if __name__ == '__main__':
    main()
# Tags: bs4, text, BeautifulSoup, li, content, url, parsing
# From: https://www.cnblogs.com/tgfoven/p/17028569.html