import requests
import os
import re
import json
import certifi
# Point requests at certifi's CA bundle: requests reads the
# REQUESTS_CA_BUNDLE environment variable to locate the certificates it uses
# for HTTPS verification.
os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
# Base search URL with the original pageNum parameter removed; the page loop
# appends "&pageNum=<n>" for each request.
base_url = (
    'https://we.51job.com/api/job/search-pc'
    '?api_key=51job&timestamp=1735109432'
    '&keyword=python%E7%88%AC%E8%99%AB&searchType=2'
    '&function=&industry=&jobArea=000000&jobArea2=&landmark=&metro=&salary='
    '&workYear=&degree=&companyType=&companySize=&jobType=&issueDate='
    '&sortType=0&requestId=&pageSize=20&source=1&accountId=217570594'
    '&pageCode=sou%7Csou%7Csoulb&scene=7'
)

# Request headers sent with every page fetch.
# Note: the cookie value expires; refresh it when requests stop working.
# NOTE(review): this looks like a logged-in session cookie -- consider loading
# it from an environment variable or an untracked config file instead of
# keeping it in the source.
he = {
    'referer': "https://we.51job.com/pc/search?keyword=python%E7%88%AC%E8%99%AB",
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
    # One "name=value" pair per source line; adjacent literals are joined by
    # the parser into a single "; "-separated header value.
    "cookie": (
        "guid=a39a45bebcd3beb5e30c402b643815bc; "
        "ps=needv%3D0; "
        "sensor=createDate%3D2022-10-21%26%7C%26identityType%3D1; "
        "nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; "
        "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22217570594%22%2C%22first_id%22%3A%22193f677f656a9-01c79a4d65a09f9-4c657b58-655360-193f677f657623%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkzZjY3N2Y2NTZhOS0wMWM3OWE0ZDY1YTA5ZjktNGM2NTdiNTgtNjU1MzYwLTE5M2Y2NzdmNjU3NjIzIiwiJGlkZW50aXR5X2xvZ2luX2lkIjoiMjE3NTcwNTk0In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22217570594%22%7D%2C%22%24device_id%22%3A%22193f677f656a9-01c79a4d65a09f9-4c657b58-655360-193f677f657623%22%7D; "
        "partner=SEM_pcbingpz_02; "
        "slife=lowbrowser%3Dnot%26%7C%26lastlogindate%3D20241225%26%7C%26; "
        "51job=cuid%3D217570594%26%7C%26cusername%3D5A%252FqMpj9ez9tY9J3CrfQpFiX42NHUDXtEKDnWGNs8os%253D%26%7C%26cpassword%3D%26%7C%26cname%3DVeJutepWmSe61aTXMh0UtQ%253D%253D%26%7C%26cemail%3D5KeCgONl9DxjAYYJtjU%252B1nQ6PZhyh640BzGmNjc983o%253D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0vuORyJTH4uQ%26%7C%26cconfirmkey%3D2806qjteGgR4Q%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D0%26%7C%26cnamekey%3D28.I0RusXfUQU%26%7C%26to%3D93c08ae46279873b2d6a0674746981e8676b6929%26%7C%26; "
        "tfstk=fk9x3ljeT40D-5E8w5GlSmGVjtooKdK2VE-QINb01ULJ5HQMmVRMW1QRvitGnfRRWUTRlZXi0CLJyUo4iNvMyNTHWGvDInt1WtJkiRbmuOK6-O3n-vDH0nWOC20nvgg7v9sQhiaGl3a5XQtGxvDH0nZccDxSKK0t6f_C5O1fGua5zGEfCST1P_sCxr665OiRNGjThow1l365zG615F6skjGAWo_LBCmnWZ8zW1e_CKMNMwUhPgSJe25f-nt_CRiCDsQBca3UFwWhNh59QlyTdnOe7MTblVQwe39XOFgzewOpVUCHM2evJES11ZJjjS7XnQO5hQuLtgxDXLBXZ2FO3ItBe6OQl2QBiEJV3d03sTAHuB1eaVFdRBxFHa8xd-K5TZ6RVpigTavRoF99lmwlqebD0KO-bztk-ZBf6K0Q6NORz1OHH4w6R3xH3_JtJz_wiMW5XdoQ6NC14TpHpFy3-wIgG0n8_55f4Nx23sUA6n9C2wmP45PNNgSR-0n8_55f4gQn4yPa__sP.; "
        "search=jobarea%7E%60%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; "
        "Hm_lvt_1370a11171bd6f2d9b1fe98951541941=1735085136,1735093043,1735119972,1735124633; "
        "HMACCOUNT=2C756155F43A3C7F; "
        "Hm_lpvt_1370a11171bd6f2d9b1fe98951541941=1735124685; "
        "JSESSIONID=5C95DFBFD5CBC0A00353C3FC3D184D3B; "
        "acw_tc=ac11000117351275777543826e009118df4b0cd1da2fb8b2572d320961c0f3; "
        "ssxmod_itna=eqRxuQPWqiqCw4BPiKGdD7fuB0OWqTiteHQKCCDl=YxA5D8D6DQeGTrRQDBCr1=r2PP3WoecDGQWx0KKHW=rf2EiZh40aDbqGkK9jxii9DCeDIDWeDiDG4GmB4GtDpxG=DjAKiTDzqG78ayCfx51c2KDbxi3f2G8DQcCDYvFux0CfhKDBxDCbzoInDwFC8gC2EKDKxi7DnFGDmHkSEPDwoosFR5DIh3=loovDX8C825Hfow87xDrQLv8qW2DQRjFsFkCK0QxkfKD9OoDsOGBHAFEmkzz8x0ODiKDccSMta2Mcav+fiKslcUGYeHfaP8Thjw+A0bIDq+GNI0t+irKD49xxz+xC4592HW4rmBe7biKDTDb3l2DCYDGR5AWDD==; "
        "ssxmod_itna2=eqRxuQPWqiqCw4BPiKGdD7fuB0OWqTiteHQKCD6pzx0H307UDLxY9pD=; "
        "acw_sc__v2=676bf2335cfea4d782def75da4c8be42036552fc"
    ),
}
# Create the directory that receives the saved pages if it does not exist yet.
# makedirs(exist_ok=True) folds the existence check and the creation into one
# call, so a directory that appears in between cannot make the script fail.
os.makedirs('m', exist_ok=True)
# Walk the result pages with a plain for loop (no helper function), extract
# the job-link fragments from each response and save them per page.

# Compiled once outside the loop: matches from a `"jobId":"` key lazily up to
# the first following `html"` in the response text.
job_link_pattern = re.compile(r'"jobId":".*?html"')

for page_num in range(1, 11):  # pages 1-10; adjust the range as needed
    url = f"{base_url}&pageNum={page_num}"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url=url, headers=he, timeout=10)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Report the failed page and move on instead of saving an error body.
        print(f"第{page_num}页请求失败: {exc}")
        continue
    res.encoding = 'utf-8'
    matches = job_link_pattern.findall(res.text)

    # The file receives the string form of the match list (not real HTML),
    # exactly as before.
    with open(f"m/{page_num}.html", 'w', encoding='utf-8') as f:
        f.write(str(matches))
    print(f"第{page_num}页文件保存完成!")