# -*- coding: utf-8 -*-
'''
Scrape internship listings from shixiseng.com
(https://www.shixiseng.com/interns?page=2&type=intern&keyword=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&area=&months=&days=&degree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%85%A8%E5%9B%BD&internExtend=)
Fetch all listings from the first 5 result pages and store them in a CSV file.
'''
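# How the site's anti-scraping scheme works: the API replaces sensitive
# characters (digits, common hanzi, letters) with private-use HTML entities
# such as &#xe283, rendered by a custom webfont. Decoding therefore needs the
# font's own tables: entity codepoint -> glyph name -> glyph id -> character.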
import csv
import io
import json
import re
import crawles
import requests
from fontTools.ttLib import TTFont
url = 'https://www.shixiseng.com/app/interns/search/v2'
cookies = {  # session cookies captured from a browser visit; substitute your own
    'utm_source_first': 'PC',
    'utm_source': 'PC',
    'utm_campaign': 'PC',
    'Hm_lvt_03465902f492a43ee3eb3543d81eba55': '1681560188',
    'RANGERS_WEB_ID': '7147610799707866638',
    'RANGERS_SAMPLE': '0.31832347309583',
    'adCloseOpen': 'true',
    'position': 'pc_search_syss',
    'Hm_lpvt_03465902f492a43ee3eb3543d81eba55': '1681560491',
}
headers = {
    'authority': 'www.shixiseng.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'content-type': 'application/x-www-form-urlencoded',
    'pragma': 'no-cache',
    'referer': 'https://www.shixiseng.com/interns?keyword=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&city=%E5%85%A8%E5%9B%BD&type=intern',
    'sec-ch-ua': '',  # original value lost in transcription
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '',  # original value lost in transcription
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
params = {
    'build_time': '1681560501489',
    'page': '{page}',  # placeholder; overwritten with the real page number in sxs_get()
    'type': 'intern',
    'keyword': '数据挖掘',
    'area': '',
    'months': '',
    'days': '',
    'degree': '',
    'official': '',
    'enterprise': '',
    'salary': '-0',
    'publishTime': '',
    'sortType': '',
    'city': '全国',
    'internExtend': '',
}
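# These mirror the query string of the listing URL in the docstring:
# keyword='数据挖掘' is the search term ("data mining"), city='全国' means
# nationwide, and salary='-0' appears to be the unfiltered default.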
# Output file
csv_file = open('sxs_info.csv', 'w', encoding='gbk', newline='')  # gbk so Excel on Chinese-locale Windows opens it directly
csv_f = csv.writer(csv_file)
csv_f.writerow(['岗位', '日薪', '行业', '公司名称', '地点时间', '工作描述', '内容描述'])  # position, daily pay, industry, company, location/schedule, job tags, company tags
# Fetch one page, decode the font-obfuscated response, then print and save it
def sxs_get(page):
    params['page'] = str(page)  # fill in the requested page number
    response = crawles.get(url, headers=headers, params=params, cookies=cookies)
    text = response.text
    # print(text)  # the raw text still contains &#x.... entities at this point
    # Quick look at the parsed payload:
    # for i in response.json['msg']['data']:
    #     print(i)
    # Download the obfuscation font and dump it to XML for the regexes below
    font_url = 'https://www.shixiseng.com/interns/iconfonts/file'
    ttf = TTFont(io.BytesIO(requests.get(font_url).content))
    ttf.saveXML('sxs.xml')  # refresh the dump every run so it matches the font just fetched
    # Read the dump back in
    with open('sxs.xml', 'r', encoding='utf-8') as f:
        sxs_xml = f.read()
    # Regex the two mappings out of the XML
    grapy = re.findall(r'<map code="0x(.*?)" name="(.*?)"/>', sxs_xml)     # hex codepoint -> glyph name
    sequence = re.findall(r'<GlyphID id="(\d+)" name="(\w+)"/>', sxs_xml)  # glyph id -> glyph name
    sequence_dict = {name: gid for gid, name in sequence}                  # glyph name -> glyph id
    # The font's characters in glyph-id order
    str_data = '0123456789一师x会四计财场DHLPT聘招工d周L端p年hx设程二五天tXG前KO网SWcgkosw广市月个BF告NRVZ作bfjnrvz三互生人政AJEI件M行QUYaeim软qu银y联'
    str_list = [' ', ''] + list(str_data)  # two leading slots pad glyphs 0 and 1, which carry no character
    # Chain the mappings into one dict, matched with the following logic:
    # &#xe283 -> uni30 -> glyph id -> character in str_list
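    # For instance, if the XML contains <map code="0xe283" name="uni30"/> and
    # <GlyphID id="2" name="uni30"/>, then sequence_dict['uni30'] == '2' and
    # str_list[2] == '0', so every '&#xe283' in the response decodes to '0'.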
    grapy_dict = {f'&#x{k}': str_list[int(sequence_dict[v])] for k, v in grapy}
    # print(grapy_dict)
    # Replace every entity in the raw text with its real character
    for k, v in grapy_dict.items():
        text = text.replace(k, v)
    # The decoded text is valid JSON again; pull out the listings
    data_list = json.loads(text)['msg']['data']
    # print(data_list)
    for i, d in enumerate(data_list):
        print(i + 1, f"岗位:{d['name']},日薪:{d['minsal']}-{d['maxsal']}/天,"
                     f"行业:{d['industry']}/{d['scale']},公司名称:{d['cname']},"
                     f"地点时间:{d['city']}|{d['day']}天/周|{d['month_num']}个月,"
                     f"描述1:{d['i_tags']},描述2:{d['c_tags']}")
        row = (d['name'], f"{d['minsal']}-{d['maxsal']}/天",
               f"{d['industry']}/{d['scale']}", d['cname'],
               f"{d['city']}|{d['day']}天/周|{d['month_num']}个月",
               d['i_tags'], d['c_tags'])
        # Append the decoded row to the CSV
        csv_f.writerow(row)
for p in range(1, 6):  # loop over the first 5 result pages
    print(f'Fetching page {p}...')
    print('-' * 50)
    sxs_get(p)
csv_file.close()
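As a variation, the XML dump and the two regexes can be skipped by reading the same mappings straight from fontTools. This is a minimal sketch, assuming the same str_data glyph-order table as above; build_grapy_dict is a hypothetical helper, not part of the original script:

def build_grapy_dict(font_bytes, str_list):
    # cmap maps unicode codepoints to glyph names -- the same pairs the
    # <map .../> regex extracts from the XML dump
    font = TTFont(io.BytesIO(font_bytes))
    cmap = font['cmap'].getBestCmap()
    # getGlyphID returns the glyph's position in the glyph order, i.e. the
    # id attribute the <GlyphID .../> regex extracts
    return {f'&#x{code:x}': str_list[font.getGlyphID(name)]
            for code, name in cmap.items()}

# Usage inside sxs_get, replacing the saveXML/regex block:
# grapy_dict = build_grapy_dict(requests.get(font_url).content, str_list)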