import os

import requests
from lxml import etree


def create_request(page):
    # Page 1 of the listing has no index suffix; later pages are index_<n>.html.
    if page == 1:
        url = 'http://www.zhb.org.cn/hbzx/news_2'
    else:
        url = 'http://www.zhb.org.cn/hbzx/news_2/index_' + str(page) + '.html'
    headers = {
        'Host': 'www.zhb.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response


def get_content(response):
    # Return the raw bytes; lxml detects the page encoding itself.
    return response.content


def get_hreflist(content):
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@class="newsbox_2"]//li/a/@href')
    # Drop the "javascript:;" placeholder anchors, then make the links absolute.
    href_list = [item for item in href_list if item != 'javascript:;']
    base_url = 'http://www.zhb.org.cn'
    return [base_url + href for href in href_list]


def download_text(url_list):
    failed_num = 0
    headers = {
        'Host': 'www.zhb.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    save_dir = './中国环境保护协会/新闻/'
    os.makedirs(save_dir, exist_ok=True)
    for url in url_list:
        try:
            response = requests.get(url, headers=headers)
            tree = etree.HTML(response.content)
            # The article title becomes the file name; strip "/" so it is a valid path.
            name = tree.xpath('//div[@class="news_titlenr"]/text()')[0] + '.txt'
            name = name.replace('/', '')
            save_path = save_dir + name
            # Body paragraphs, joined one per line.
            text = tree.xpath('//div[@class="news_nrbox"]//p/text()')
            result = '\n'.join(text)
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except Exception as e:
            failed_num += 1
            print('Failed to save {}: {}'.format(url, e))
    if failed_num:
        print('{} articles failed on this listing page'.format(failed_num))


if __name__ == '__main__':
    start_page = 2
    end_page = 263
    for page in range(start_page, end_page + 1):
        request = create_request(page)    # fetch listing page `page`
        content = get_content(request)    # get that page's HTML source
        url_list = get_hreflist(content)  # collect every article link on it
        download_text(url_list)           # save each article as a .txt file
        print('Page {} downloaded'.format(page))
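The XPath extraction in get_hreflist is easy to sanity-check offline before running the full crawl. The fragment below is invented for illustration (it only mimics the newsbox_2 list structure this scraper targets, not the site's real markup), but it exercises the same steps: query the anchors, drop the "javascript:;" placeholders, and prepend the site root.

from lxml import etree

# Hypothetical HTML fragment shaped like the listing markup above.
sample_html = '''
<div class="newsbox_2">
  <ul>
    <li><a href="/hbzx/news_2/detail_1.html">News one</a></li>
    <li><a href="javascript:;">placeholder</a></li>
    <li><a href="/hbzx/news_2/detail_2.html">News two</a></li>
  </ul>
</div>
'''

tree = etree.HTML(sample_html)
hrefs = tree.xpath('//div[@class="newsbox_2"]//li/a/@href')
hrefs = [h for h in hrefs if h != 'javascript:;']   # drop placeholder anchors
urls = ['http://www.zhb.org.cn' + h for h in hrefs]
print(urls)
# ['http://www.zhb.org.cn/hbzx/news_2/detail_1.html',
#  'http://www.zhb.org.cn/hbzx/news_2/detail_2.html']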
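With 262 listing pages and every article on each, the loop fires thousands of requests back to back. A shared session with a retry policy and a short pause between requests is gentler on the server and more robust against transient errors. This is a minimal sketch, not part of the original script; polite_get and the delay value are illustrative choices.

import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One session reuses the TCP connection and applies the retry policy to all calls.
session = requests.Session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))

def polite_get(url, headers, delay=0.5):
    """GET with a fixed pause so consecutive requests are spaced out."""
    time.sleep(delay)
    return session.get(url, headers=headers, timeout=10)

Swapping polite_get in for the bare requests.get calls leaves the parsing and saving logic untouched.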