"""Scrape the index page at ``url`` and save each linked page's text.

The script selects every ``<li>`` under ``.sidamingzhu-list-mulu > ul`` on
the index page, follows the link inside it, extracts the text of the
``<div class="grap">`` element on the linked page, and appends
``title:content`` to ``三国演义.txt`` (UTF-8).
"""
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# User-Agent header sent with every request.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    )
}

url = 'https://sanguo.5000yan.com/'

# Seconds to wait on a request; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Fetch and parse the index page. Fail loudly on an HTTP error instead of
# parsing an error page and silently producing an empty output file.
response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
page_text = response.content
soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select('.sidamingzhu-list-mulu>ul>li')

# The context manager guarantees the file is flushed and closed even if a
# later request or parse step raises.
with open('三国演义.txt', 'w', encoding='utf-8') as fp:
    for li in li_list:
        link = li.a
        # Skip list items that carry no usable link.
        if link is None or not link.get('href'):
            continue

        # get_text() also works when the anchor wraps nested tags, where
        # .string would be None and break the concatenation below.
        title = link.get_text()
        # Resolve relative hrefs against the index URL; absolute hrefs are
        # returned unchanged.
        detail_url = urljoin(url, link['href'])

        # Request the detail page; one failed page should not abort the
        # whole run.
        try:
            detail_response = requests.get(
                url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
            detail_response.raise_for_status()
        except requests.RequestException as exc:
            print(title, '爬取失败:', exc)
            continue

        # Parse the detail page and pull out the content container.
        soup1 = BeautifulSoup(detail_response.content, 'lxml')
        div_tag = soup1.find('div', class_='grap')
        if div_tag is None:
            print(title, '爬取失败: 未找到正文')
            continue
        content = div_tag.text

        # Save one "title:content" record per page.
        fp.write(title + ':' + content + '\n')
        print(title, '爬取成功!')
标签: url, text, detail, li, 爬取, headers, BeautifulSoup, 三国演义
From: https://www.cnblogs.com/chengshu1258/p/17034129.html