python爬虫如何爬取招聘数据

标签：index python pprint 爬虫 爬取 data

互联互通的时代，几乎任何行业的相关数据都可以在网络上查到。对于大数据公司而言，如何借助爬虫抓取公开数据就显得尤为重要。下面是一个抓取招聘信息等数据的代码案例，可供参考。

# coding=utf-8

import csv
import json
import os.path
import pprint

import requests
import re


# Make sure the output directory exists before any file is opened inside it.
os.makedirs('info/', exist_ok=True)

csv_path = 'info/招聘.csv'
# The CSV is opened in append mode so rows from earlier runs are kept. Record
# whether it already holds data so the header row is only written once instead
# of being repeated in the middle of the file on every run.
need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

# newline='' lets the csv module control line endings itself.
# NOTE: `f` intentionally stays open; rows are written to it further down the
# script via `csv_writer`.
f = open(csv_path, encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '职位名字',
    '公司名字',
    '工作城市',
    '学历要求',
    '经验要求',
    '薪资要求',
    '公司地址',
    '详情页'
])
# Write the header row only for a new or empty file.
if need_header:
    csv_writer.writeheader()

# Request headers are the same for every page, so build them once.
# NOTE(review): the Cookie is a hard-coded, captured browser session. It will
# expire and should be refreshed (ideally loaded from configuration instead of
# being kept in source).
headers = {
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https%3A%2F%2Fwww.lagou.com%2Fwn%2Fjobs%3Fpn%3D2%26fromSearch%3Dtrue%26kd%3Dpython&t=1648984113&_ti=1',
    'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229659966%22%2C%22%24device_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2298.0.4758.102%22%7D%2C%22first_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%7D',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

# The listing page embeds its data as JSON inside the __NEXT_DATA__ script tag.
# re.S keeps the match working even if the JSON payload contains newlines.
next_data_pattern = re.compile(
    '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.S,
)
# Characters that are not allowed in file names (Windows rules are the
# strictest, so they are used here).
invalid_filename_chars = re.compile(r'[\\/*?:"<>|]')

try:
    # Walk result pages 1..10 of the "python" keyword search.
    for page in range(1, 11):
        url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'

        # A timeout prevents the script from hanging forever on a stalled
        # connection.
        rsp = requests.get(url=url, headers=headers, timeout=10)
        print(rsp.status_code)

        match = next_data_pattern.search(rsp.text)
        if match is None:
            # Typically an anti-bot / login page; skip it instead of crashing
            # with an IndexError.
            print(f'page {page}: __NEXT_DATA__ not found, skipping')
            continue

        json_data = json.loads(match.group(1))
        result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']

        for position in result:
            # Job description with the HTML line-break tags stripped out;
            # a missing description is treated as empty text.
            job_detail = (position['positionDetail'] or '').replace('<br />', '').replace('<br>', '')
            # Detail page URL; the '/' before the position id is required.
            href = f'https://www.lagou.com/wn/jobs/{position["positionId"]}.html'
            row = {
                '职位名字': position['positionName'],
                '公司名字': position['companyFullName'],
                '工作城市': position['city'],
                '学历要求': position['education'],
                '经验要求': position['workYear'],
                # Salary column must come from 'salary', not 'workYear'.
                '薪资要求': position['salary'],
                '公司地址': position['positionAddress'],
                '详情页': href
            }
            csv_writer.writerow(row)

            # One text file per posting, named "<position><company>.txt".
            title = position['positionName'] + position['companyFullName']
            new_title = invalid_filename_chars.sub('', title)
            # Use a distinct name so the CSV file handle `f` is not shadowed.
            with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as detail_file:
                detail_file.write(job_detail)
            print(row)
finally:
    # Flush and release the CSV file even if a page fails part-way through.
    f.close()

标签：index,python,pprint,爬虫,爬取,data
From： https://blog.51cto.com/u_13488918/5996897

python爬虫如何爬取招聘数据

相关文章

赞助商

阅读排行