首页 > 其他分享 >89今日头条搜索

89今日头条搜索

时间:2023-03-06 18:44:06浏览次数:46  
标签:dic source 89 搜索 time import data page 头条

仅供学习参考!!

控制台图

存入excel图(重新运行会覆盖,建议重新命名)

完整代码

import requests
from urllib.parse import quote
import pandas as pd
import time
import json
import re


def collect(key, total):
    """Scrape Toutiao news-search results for a keyword and save them to Excel.

    Requests result pages ``0 .. total - 1`` from so.toutiao.com, extracts
    every JSON payload embedded after a ``s-result-json`` marker, prints each
    record that could be built, and finally writes all records to
    ``今日头条搜索.xlsx`` (relative path; an existing file is overwritten).

    The ``User-Agent`` and ``Cookie`` request headers are read from the
    module-level globals ``ua`` and ``ck``, which must be set before calling.

    Args:
        key: Search keyword; URL-quoted before being placed in the query.
        total: Number of result pages to fetch.

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            does not answer within the timeout.
    """
    # Loop invariants: headers, pattern and quoted keyword never change.
    headers = {
        'User-Agent': ua,
        'Cookie': ck
    }
    pattern = re.compile(
        r'data-for=s-result-json data-used-by=hydrate>(.*?)</script>'
    )
    quoted_key = quote(key)

    records = []
    for page in range(total):
        if page:
            # Throttle between consecutive HTTP requests, independent of how
            # many results the previous page happened to contain.
            time.sleep(1)
        url = f'https://so.toutiao.com/search?keyword={quoted_key}&pd=information&dvpf=pc&page_num={page}'
        # A timeout keeps a stalled connection from hanging the script forever.
        html = requests.get(url=url, headers=headers, timeout=10).content.decode('utf-8')
        for payload in pattern.findall(html):
            try:
                data = json.loads(payload)['data']
                record = {
                    'title': data['title'],
                    'abstract': data['abstract'],
                    # publish_time is converted as a Unix timestamp (seconds).
                    'date': time.strftime('%Y-%m-%d', time.localtime(int(data['publish_time']))),
                    'source': data['source'],
                    'href': data['source_url'],
                    'comment': data['comment_count']
                }
            except KeyError:
                # Payloads lacking any expected field are skipped silently
                # (presumably non-article result cards -- TODO confirm).
                continue
            except (ValueError, TypeError, OverflowError, OSError) as e:
                # Malformed JSON or an unusable timestamp: report and move on.
                print(str(e))
                continue
            records.append(record)
            print(record)

    # No ``encoding`` argument: pandas never used it for Excel output and
    # removed the keyword in 2.0, where passing it raises TypeError.
    pd.DataFrame(records).to_excel('今日头条搜索.xlsx', index=False)


if __name__ == '__main__':
    # Request headers. collect() reads these two names as module-level
    # globals, so they must keep the names ``ua`` and ``ck``.
    ua = 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/91.0.4472.106Safari/537.36'
    ck = 'passport_csrf_token=72948754528a79a9bf062bf1733fc133; tt_webid=7078110223483389448; _S_DPR=1.25; _S_IPAD=0; MONITOR_WEB_ID=7078110223483389448; ttwid=1%7ChKYuO5fu4IwwB_lSa1vsppRwr8U2VaXqA7O3OhsuYpA%7C1648012935%7C5497cee960112e92e59f927c732a5b71043db6617e25e79f2af45faa8144898c; _S_WIN_WH=394_754'

    # Ask for the search keyword first, then for the number of pages.
    keyword = input("输入查询的关键字:")
    page_count = int(input("采集的页数:"))
    collect(key=keyword, total=page_count)

标签:dic,source,89,搜索,time,import,data,page,头条
From: https://www.cnblogs.com/code3/p/17184955.html

相关文章