仅供学习参考!!
控制台图
存入 Excel 图(重新运行会覆盖已有文件,建议先将旧文件重命名)
完整代码
import requests
from urllib.parse import quote
import pandas as pd
import time
import json
import re
def collect(key, total):
    """Scrape Toutiao news search results for a keyword and save them to Excel.

    Requests ``total`` result pages (``page_num`` 0 .. total-1) from
    so.toutiao.com, extracts the JSON payloads embedded in each page's
    ``s-result-json`` script tags, prints every extracted record, and
    finally writes all records to ``今日头条搜索.xlsx`` in the current working
    directory, overwriting any existing file of that name.

    Reads the module-level globals ``ua`` (User-Agent header value) and
    ``ck`` (Cookie header value); both must be defined before this is called.

    Args:
        key: Search keyword; it is URL-quoted before being placed in the
            query string.
        total: Number of result pages to fetch.
    """
    # Compiled once: the pattern is identical for every page.
    pattern = re.compile('data-for=s-result-json data-used-by=hydrate>(.*?)</script>')
    headers = {
        'User-Agent': ua,
        'Cookie': ck,
    }
    records = []
    for page in range(total):
        url = f'https://so.toutiao.com/search?keyword={quote(key)}&pd=information&dvpf=pc&page_num={page}'
        try:
            # The timeout keeps a single stalled connection from hanging the run.
            html = requests.get(url=url, headers=headers, timeout=10).content.decode('utf-8')
        except requests.RequestException as e:
            # Skip the failed page so rows gathered so far are still saved.
            print(str(e))
            time.sleep(1)
            continue
        for payload in pattern.findall(html):
            try:
                data = json.loads(payload)['data']
                record = {
                    'title': data['title'],
                    'abstract': data['abstract'],
                    # publish_time is converted as a Unix timestamp to a
                    # local-time calendar date.
                    'date': time.strftime('%Y-%m-%d', time.localtime(int(data['publish_time']))),
                    'source': data['source'],
                    'href': data['source_url'],
                    'comment': data['comment_count'],
                }
            except KeyError:
                # Payloads missing any of the fields above are skipped
                # silently (presumably non-article result cards - confirm).
                pass
            except Exception as e:
                # Malformed JSON or unexpected value types: report and move on.
                print(str(e))
            else:
                records.append(record)
                print(record)
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
    # No `encoding` argument: it was deprecated as unused in pandas 1.5 and
    # removed in pandas 2.0, where passing it raises TypeError.
    pd.DataFrame(records).to_excel('今日头条搜索.xlsx', index=False)
if __name__ == '__main__':
    # Script entry point: prompt for a keyword and a page count, then scrape.
    #
    # `ua` and `ck` are deliberately module-level names: collect() reads them
    # as globals when it builds the request headers.
    # NOTE(review): the User-Agent value contains no spaces, which looks like
    # whitespace lost in copy/paste - confirm against a real browser UA string.
    ua = 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/91.0.4472.106Safari/537.36'
    # NOTE(review): hard-coded session cookie - presumably needs replacing
    # with a current one from your own browser session.
    ck = 'passport_csrf_token=72948754528a79a9bf062bf1733fc133; tt_webid=7078110223483389448; _S_DPR=1.25; _S_IPAD=0; MONITOR_WEB_ID=7078110223483389448; ttwid=1%7ChKYuO5fu4IwwB_lSa1vsppRwr8U2VaXqA7O3OhsuYpA%7C1648012935%7C5497cee960112e92e59f927c732a5b71043db6617e25e79f2af45faa8144898c; _S_WIN_WH=394_754'

    search_keyword = input("输入查询的关键字:")
    page_count_text = input("采集的页数:")
    # int() raises ValueError if the page count entered is not an integer.
    collect(search_keyword, int(page_count_text))
标签:dic,source,89,搜索,time,import,data,page,头条
From: https://www.cnblogs.com/code3/p/17184955.html