# 在互联互通的时代,几乎任何行业的相关数据都可以在网络上查到。对大数据公司而言,如何借助爬虫抓取公开数据就显得尤为重要。下面是一个抓取招聘信息等数据的代码案例,可供参考。
# coding=utf-8
# 标签: index, python, pprint, 爬虫, 爬取, data
# From: https://blog.51cto.com/u_13488918/5996897
import csv
import json
import os.path
import pprint
import requests
import re
# Output setup: make sure the output directory exists, open the CSV file that
# collects one row per job posting, and prepare a DictWriter for it.

# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs('info/', exist_ok=True)

# Append mode keeps rows written by earlier runs; newline='' lets the csv
# module control line endings itself.
f = open('info/招聘.csv', encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '职位名字',
    '公司名字',
    '工作城市',
    '学历要求',
    '经验要求',
    '薪资要求',
    '公司地址',
    '详情页',
])

# Write the header only when the file is still empty. Because the file is
# opened in append mode, writing it unconditionally would insert another
# header row into the middle of the data on every run.
if os.path.getsize('info/招聘.csv') == 0:
    csv_writer.writeheader()
# Scrape result pages 1-10 of the Lagou "python" job search. For every posting
# found, one row is appended to the CSV (via csv_writer) and the job
# description is written to its own text file under info/.

# The request headers are the same for every page, so build them once.
# NOTE(review): the Cookie looks like it was captured from a browser session;
# presumably it expires and has to be refreshed - confirm before running.
headers = {
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https%3A%2F%2Fwww.lagou.com%2Fwn%2Fjobs%3Fpn%3D2%26fromSearch%3Dtrue%26kd%3Dpython&t=1648984113&_ti=1',
    'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229659966%22%2C%22%24device_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2298.0.4758.102%22%7D%2C%22first_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%7D',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}

# The listing data is embedded in the page as JSON inside this <script> tag.
# re.S lets the capture span line breaks if the payload is not on one line.
next_data_pattern = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.S,
)

# Characters that must not appear in a file name (the Windows-reserved set).
unsafe_filename_chars = re.compile(r'[\\/:*?"<>|]')

try:
    for page in range(1, 11):
        url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'
        # A timeout keeps the script from hanging forever on a stalled connection.
        rsp = requests.get(url=url, headers=headers, timeout=10)
        print(rsp.status_code)

        # Skip pages that do not carry the payload instead of failing with a
        # bare IndexError on an empty match list.
        match = next_data_pattern.search(rsp.text)
        if match is None:
            print(f'page {page}: __NEXT_DATA__ payload not found, skipping')
            continue

        json_data = json.loads(match.group(1))
        result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']

        for job in result:
            # Job description with the HTML line-break tags removed.
            job_detail = job['positionDetail'].replace('<br />', '').replace('<br>', '')
            href = f'https://www.lagou.com/wn/jobs/{job["positionId"]}.html'

            row = {
                '职位名字': job['positionName'],
                '公司名字': job['companyFullName'],
                '工作城市': job['city'],
                '学历要求': job['education'],
                '经验要求': job['workYear'],
                # The salary column comes from 'salary', not a copy of 'workYear'.
                '薪资要求': job['salary'],
                '公司地址': job['positionAddress'],
                '详情页': href,
            }
            csv_writer.writerow(row)

            # One text file per posting, named after position + company.
            title = job['positionName'] + job['companyFullName']
            safe_title = unsafe_filename_chars.sub('', title)
            # Use a distinct handle name so the CSV file object `f` is not rebound.
            with open('info/' + safe_title + '.txt', 'w', encoding='utf-8') as detail_file:
                detail_file.write(job_detail)

            print(row)
finally:
    # Flush and close the CSV file even if a request or parse step fails.
    f.close()