for page in range(start_page, end_page + 1):
    # Send a GET request for this page. Note: str.format() only substitutes
    # the page number if the base URL contains a '{}' placeholder; the URL
    # used here has none, so one would need to be added for the loop to
    # actually paginate.
    page_url = url.format(page)
    request = urllib.request.Request(page_url, headers=headers)
    response = urllib.request.urlopen(request)
    contents = response.read()
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(contents, "html.parser")
    # In each <a> tag's onclick, the first argument is the letter type and
    # the second is the letter ID
    a_tags = soup.find_all('a', onclick=True)
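The regex used to pull both onclick arguments can be checked in isolation before running the full scraper. A minimal sketch; the sample onclick string below is a hypothetical value shaped like the pattern the code searches for, not one taken from the site:

import re

# Hypothetical onclick value matching the expected pattern:
# letter type first, letter ID second.
sample = "letterdetail('1', 'AH23061000001')"

match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)", sample)
if match:
    letter_type = match.group(1)  # '1', '2' or '3'
    letter_id = match.group(2)    # the letter's ID string
    print(letter_type, letter_id)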
The complete code is as follows (the scraped results are written to separate txt files according to letter type):
import urllib.request
import re
from bs4 import BeautifulSoup

def huoqu():
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    # Send a GET request and parse the response with BeautifulSoup
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    contents = response.read()
    soup = BeautifulSoup(contents, "html.parser")
    return soup

def huoqu1(start_page, end_page):
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    # One output file per letter type
    f1 = open('G:/python/pythonProject/信件爬取/1.txt', 'a')
    f2 = open('G:/python/pythonProject/信件爬取/2.txt', 'a')
    f3 = open('G:/python/pythonProject/信件爬取/3.txt', 'a')
    for page in range(start_page, end_page + 1):
        # str.format() only substitutes the page number if the base URL
        # contains a '{}' placeholder (the URL above has none)
        page_url = url.format(page)
        request = urllib.request.Request(page_url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(contents, "html.parser")
        # In each <a> tag's onclick, the first argument is the letter type
        # and the second is the letter ID
        a_tags = soup.find_all('a', onclick=True)
        for element in a_tags:
            onclick_value = element["onclick"]
            match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)", onclick_value)
            if match:
                onclick_param1 = match.group(1)  # letter type: '1', '2' or '3'
                onclick_param2 = match.group(2)  # letter ID
                if onclick_param1 == '1':
                    f1.write(onclick_param2 + '\n')
                if onclick_param1 == '2':
                    f2.write(onclick_param2 + '\n')
                if onclick_param1 == '3':
                    f3.write(onclick_param2 + '\n')
                print(f"onclick param 1: {onclick_param1}, onclick param 2: {onclick_param2}")
        f1.flush()
        f2.flush()
        f3.flush()
    f1.close()
    f2.close()
    f3.close()

if __name__ == '__main__':
    huoqu1(1, 173)