The home page is the poet index at https://tangshi.5000yan.com/shiren/ (screenshot omitted). The capture tool shows that each poet sits in an li tag under main/div/ul, with the poet's name and detail-page link in a nested a tag.
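Before writing the full crawler, that author-list XPath can be sanity-checked on its own. A minimal sketch, using the same URL and the XPath from the final code (headers trimmed for brevity; the exact output depends on the live page):

import requests
from lxml import etree

url = 'https://tangshi.5000yan.com/shiren/'
headers = {'user-agent': 'Mozilla/5.0'}

# Parse from raw bytes so lxml picks up the charset declared in the HTML,
# avoiding the mojibake that requests' .text guess can produce.
tree = etree.HTML(requests.get(url, headers=headers).content)

# Each poet is an <li> under //main/div/ul; print a few names and links.
for li in tree.xpath('//main/div/ul/li')[:5]:
    print(li.xpath('./a/text()'), li.xpath('./a/@href'))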
Clicking a poet takes you to the detail page (screenshot omitted). The capture tool shows that each poem there is an article tag under a section with class "blockGroup": the title sits in h2/a, and the body text is nested under div.shi-zhong.
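The detail-page XPath can be checked the same way on a single poet, taking the first link found on the index page rather than hard-coding a URL (again a sketch, assuming the structure seen in the capture):

import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0'}
index = etree.HTML(requests.get('https://tangshi.5000yan.com/shiren/', headers=headers).content)

# Follow the first poet link found on the index page.
detail_url = index.xpath('//main/div/ul/li/a/@href')[0]
detail = etree.HTML(requests.get(detail_url, headers=headers).content)

# Each poem is an <article> under section.blockGroup: title in h2/a,
# body text nested under div.shi-zhong.
for article in detail.xpath('//section[@class="blockGroup"]/article')[:2]:
    print(article.xpath('./h2/a/text()'))
    print(article.xpath('./div/div[@class="shi-zhong"]/span/a/div/text()'))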
The scraping code is as follows:
'''
Date: 2023/1/9 22:10
Description
'''
import requests
from lxml import etree

url = 'https://tangshi.5000yan.com/shiren/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}


def get_url_xpath(url):
    url_1 = requests.get(url=url, headers=headers).content  # use .content instead of .text to avoid mojibake
    zuozhe = etree.HTML(url_1)
    return zuozhe
    # print(url_1)  # check whether the raw bytes are garbled


if __name__ == '__main__':
    xpath1 = get_url_xpath(url)
    li_list = xpath1.xpath('//main/div/ul/li')  # list of <li> tags, one per poet
    fp = open('11.txt', 'w', encoding='utf-8')
    for li in li_list:
        tang_name = li.xpath('./a/text()')  # for each <li>, the text of its <a> tag
        # print(tang_name[0])
        # fp.write(tang_name[0] + '\n\n')
        detail_url = li.xpath('./a/@href')[0]  # ./ continues the query from the current node
        # print(detail_url)
        fp.write(tang_name[0] + '\n')
        xpath2 = get_url_xpath(detail_url)
        article_list = xpath2.xpath('//section[@class="blockGroup"]/article')
        for article in article_list:
            biaoti_list = article.xpath('./h2/a/text()')  # poem titles
            neirong_list = article.xpath('./div/div[@class="shi-zhong"]/span/a/div/text()')  # poem body text
            # print(biaoti_list, neirong_list)
            for biaoti in biaoti_list:
                fp.write(biaoti)
            for neirong in neirong_list:
                fp.write(neirong)
            fp.write('\n')
    fp.close()  # flush and close the output file
    print('Download complete!')
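A few possible hardening tweaks on top of the code above, sketched with the same XPaths and output file (the sleep interval and the skip-on-empty behaviour are my own assumptions, not from the original): open the file with a with block so it is closed even if a request fails, skip entries whose XPath comes back empty instead of raising IndexError, and pause between detail pages to be polite to the server.

import time
import requests
from lxml import etree

url = 'https://tangshi.5000yan.com/shiren/'
headers = {'user-agent': 'Mozilla/5.0'}


def get_url_xpath(url):
    # .content gives raw bytes; lxml reads the charset from the page itself.
    return etree.HTML(requests.get(url=url, headers=headers).content)


if __name__ == '__main__':
    with open('11.txt', 'w', encoding='utf-8') as fp:  # closed automatically
        for li in get_url_xpath(url).xpath('//main/div/ul/li'):
            names = li.xpath('./a/text()')
            hrefs = li.xpath('./a/@href')
            if not names or not hrefs:  # skip malformed <li> entries
                continue
            fp.write(names[0] + '\n')
            for article in get_url_xpath(hrefs[0]).xpath('//section[@class="blockGroup"]/article'):
                fp.write(''.join(article.xpath('./h2/a/text()')))
                fp.write(''.join(article.xpath('./div/div[@class="shi-zhong"]/span/a/div/text()')))
                fp.write('\n')
            time.sleep(1)  # one detail page per second
    print('Download complete!')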