import json
import os.path
import time

import pandas as pd
import requests
from jsonpath import jsonpath


def _first(data, expr):
    """Return the first jsonpath match of *expr* in *data*, or None.

    Evaluates the expression once (the original code ran every
    expression twice: once for the truth test, once for the value).
    """
    matches = jsonpath(data, expr)
    return matches[0] if matches else None


def get_resp(url, name, i):
    """POST one page of the category search API and return the parsed JSON.

    url:  search endpoint (http://www.whggzy.com/front/search/category).
    name: category code, e.g. "GovernmentProcurement".
    i:    1-based page number.
    """
    headers = {
        "Referer": "http://www.whggzy.com/PoliciesAndRegulations/index.html?utm=sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Content-Type": "application/json",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "utm": "sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "categoryCode": f"{name}",
        "pageSize": 15,
        "pageNo": f"{i}"
    }
    # json=data serializes the payload as a JSON request body (the server
    # expects Content-Type: application/json, see headers above).
    resp = requests.post(url, headers=headers, json=data)
    # Fail fast on HTTP errors instead of trying to JSON-parse an error page.
    resp.raise_for_status()
    return resp.json()


def save_json(content):
    """Dump the latest raw API response to wh_data.json (overwritten each page)."""
    with open("wh_data.json", 'w', encoding="utf-8") as w:
        # ensure_ascii=False keeps the Chinese text readable instead of
        # escaping every character as \uXXXX.
        w.write(json.dumps(content, ensure_ascii=False))


def get_data(data_list, csv_path, i):
    """Extract the fields of each hit on page *i* and append them to the CSV.

    data_list: list of hit dicts from the API response.
    csv_path:  target CSV file (created by judge_csv_file).
    i:         page number, used only for the progress message.
    """
    base_url = 'http://www.whggzy.com/'
    path_name = ''
    for item in data_list:
        path_name = _first(item, '$..pathName')
        title = _first(item, '$..title')
        publish_date = _first(item, '$..publishDate')
        # publishDate looks like epoch milliseconds — guard against missing
        # values: the original crashed on `None / 1000` here.
        date = (time.strftime('%Y-%m-%d', time.localtime(publish_date / 1000))
                if publish_date is not None else None)
        attachment_url = _first(item, '$..attachmentUrl')
        rel_url = _first(item, '$..url')
        url = base_url + rel_url if rel_url is not None else None
        save_csv([path_name, title, date, attachment_url, url], csv_path)
    print(f'政策法规-->>{path_name}-->> 第{i}页下爬取完毕 !!!')


def judge_csv_file():
    """Ensure wh_data.csv exists next to this script, writing the header row
    on first creation, and return its absolute path."""
    # Absolute path of this script file; the CSV lives in the same directory.
    current_path = os.path.abspath(__file__)
    csv_path = os.path.join(os.path.abspath(os.path.dirname(current_path)), 'wh_data.csv')
    print(csv_path)
    if not os.path.exists(csv_path):
        head_list = ['项目', '标题', '日期', '附件网址', '内容地址']
        with open(csv_path, 'w', encoding="utf-8") as wf:
            wf.write(",".join(head_list) + '\n')
    return csv_path


def save_csv(data_list, csv_path):
    """Append one row to the CSV.

    mode='a' appends; index=False / header=False suppress pandas' row index
    and column headers (the header was written once by judge_csv_file).
    """
    pd.DataFrame(data=[data_list]).to_csv(
        csv_path, mode='a', index=False, header=False, encoding='utf-8')


def run(url):
    """Crawl every category, page by page, until an empty page is returned."""
    csv_path = judge_csv_file()
    name_list = ["GovernmentProcurement", "BidAndEngineerConstruction",
                 "LandAndMineralRightsTransaction",
                 "TransactionOfPropertyRights", "TransactionOfPublicResources"]
    for name in name_list:
        i = 1
        while True:
            content = get_resp(url, name, i)
            save_json(content)
            # .get chain: a malformed response ends this category cleanly
            # instead of raising KeyError.
            data_list = content.get('hits', {}).get('hits')
            if not data_list:
                break
            get_data(data_list, csv_path, i)
            i += 1


if __name__ == '__main__':
    url = "http://www.whggzy.com/front/search/category"
    run(url)
# 标签: url, list, 爬虫, 保存, jsonpath, path, csv, data
# From: https://www.cnblogs.com/xingmeng63/p/17590669.html