首页 > 其他分享 >64爬取b站,微博,ai问答等数据写入excel

64爬取b站,微博,ai问答等数据写入excel

时间:2023-02-04 11:25:16浏览次数:37  
标签:ai text excel 爬取 params res print table data

# 功能1:获取手机号归属地
# 功能2:查询天气
# 功能3:查询百度热搜
# 功能4:查询微博热搜
# 功能5:查询b站
# 功能6 ai问答(在这用不了 涉及网站逆向写在另外一个py模块,没写入到这里)

# coding=gbk
# -*- coding:utf-8 -*-

import requests
import time
import os
import re
import pandas as pd
from lxml import etree
import io
import sys
import datetime

# Exclude the AI-completion endpoint from any configured HTTP proxy.
# NOTE(review): NO_PROXY conventionally holds host names; this value is a full
# URL (scheme + path) — confirm it actually has the intended bypass effect.
os.environ['NO_PROXY'] = 'https://cc-api.sbaliyun.com/v1/completions'

# Default browser-like request headers used by the module-level scraper functions.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}


# 手机号
def phone_number():
    """Interactively look up details of a mobile number on ip138.com.

    Repeatedly prompts for a number until the user enters ``q``. For each
    number the ip138 result page is fetched and four label/value pairs
    (home location, carrier, area code, postal code) are printed.

    Network failures are reported separately from "number not found" so a
    dropped connection is no longer blamed on the user's input. Nothing is
    returned; all results go to stdout.
    """
    url = 'https://www.ip138.com/mobile.asp'
    cell_xpath = '//div[@class="table"]/table/tbody/tr[{row}]/td'
    # (table row, child tag holding the value, arrow printed between label and value)
    fields = (
        (2, 'span', '-->'),     # home location of the number
        (3, 'a', '--->'),       # carrier
        (5, 'a', '------>'),    # area code
        (6, 'a', '------>'),    # postal code
    )

    while True:
        phoneNumber = input("查询的手机号(按q退出手机号查询):")
        if phoneNumber == 'q':
            break

        query = {'mobile': phoneNumber, 'action': 'mobile'}
        try:
            # A timeout keeps the prompt from hanging forever on a dead connection.
            res = requests.get(url=url, headers=headers, params=query, timeout=10)
        except requests.RequestException:
            print("请求或响应出错,请稍后重试!")
            continue

        try:
            e = etree.HTML(res.text)
            lines = []
            for row, tag, arrow in fields:
                cell = cell_xpath.format(row=row)
                label = e.xpath(f'{cell}/text()')[0]
                value = e.xpath(f'{cell}/{tag}/text()')[0]
                lines.append(f'{label}{arrow}{value}')
        except (IndexError, AttributeError, ValueError, etree.LxmlError):
            # The expected table cells are missing: the site did not recognise the number.
            print("输入的手机号格式不正确,请重新输入!")
            continue

        print("查询结果如下:")
        print('\n'.join(lines))


# 天气
def get_weather():
    """Interactively query wentian123.com for the weather of a region.

    Repeatedly prompts for a region name until the user enters ``q``. Every
    result row whose address contains the entered text is printed as
    ``address condition temperature wind``, followed by the weekday taken
    from the page and the current local time.

    Network failures are reported separately from an unrecognised region.
    Nothing is returned; all results go to stdout.
    """
    url = 'https://www.wentian123.com/search/'
    table = '//div[@class="table-inner"]/table'

    while True:
        location = input("输入查询的地区(按q退出天气查询):")
        if location == 'q':
            break

        try:
            # A timeout keeps the prompt from hanging forever on a dead connection.
            res = requests.get(url=url, headers=headers, params={'location': location}, timeout=10)
        except requests.RequestException:
            print("请求或响应出错,请稍后重试!")
            continue

        try:
            e = etree.HTML(res.text)
            addresses = e.xpath(f'{table}//td/a/text()')
            weekdays = e.xpath(f'{table}/tbody/tr/td[1]/p[1]/text()')
            conditions = e.xpath(f'{table}/tbody/tr/td[2]/p[2]/span/text()')
            temperatures = e.xpath(f'{table}/tbody/tr/td[3]/p/text()')
            winds = e.xpath(f'{table}/tbody/tr/td[4]/p/text()')

            for address, condition, temperature, wind in zip(addresses, conditions, temperatures, winds):
                if location in address:
                    print(address, condition, temperature, wind)
            # weekdays is empty when the page has no result table -> IndexError below.
            print(f'现在是:\t{weekdays[0]}\t{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except (IndexError, AttributeError, ValueError, etree.LxmlError):
            print("输入的地区有误,请重新输入!")


# 百度热搜
def bai_du_rei_sou():
    """Interactively print one of the Baidu "top" boards.

    Shows a numbered menu (1-6 select a board, 7 leaves the loop), fetches
    ``https://top.baidu.com/board`` with the matching ``tab`` parameter and
    prints every title together with its hot index, followed by the current
    local time.

    Menu input is validated explicitly: previously ``0`` or a negative number
    was used as a negative list index and silently selected the wrong board.
    Nothing is returned; all results go to stdout.
    """
    # (value of the ``tab`` query parameter, label shown to the user)
    boards = (
        ('realtime', '热搜榜'),
        ('novel', '小说'),
        ('movie', '电影'),
        ('teleplay', '电视剧'),
        ('car', '汽车'),
        ('game', '游戏'),
    )
    url = 'https://top.baidu.com/board'
    input_error = "输入格式错误,请重新输入!"

    while True:
        print('1.热搜榜\t 2.小说\t 3.电影\t 4.电视剧\t 5.汽车\t 6.游戏\t 7.退出百度热榜单查询')
        try:
            num = int(input("输入你要查询的榜单:"))
        except ValueError:
            print(input_error)
            continue
        if num == 7:
            break
        if not 1 <= num <= len(boards):
            print(input_error)
            continue

        tab, label = boards[num - 1]
        print(f'正在查询{label}的榜单:')
        try:
            # A timeout keeps the menu from hanging forever on a dead connection.
            res = requests.get(url=url, headers=headers, params={'tab': tab}, timeout=10)
            e = etree.HTML(res.text)
            titles = e.xpath('//div[@class="c-single-text-ellipsis"]/text()')
            hot_nums = e.xpath('//div[@class="hot-index_1Bl1a"]/text()')
        except (requests.RequestException, AttributeError, ValueError, etree.LxmlError):
            print("请求或响应出错,请稍后重试!")
            continue

        for title, hot in zip(titles, hot_nums):
            print(f'标题:{title}------>热度:{hot}')
        print(f'北京时间:{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')


# 微博热搜
def weibo_search():
    """Interactively print one of the Weibo hot-search boards.

    Shows a numbered menu (1-3 select a board, 4 leaves the loop), fetches
    ``https://s.weibo.com/top/summary`` with the matching ``cate`` parameter
    and prints the entries, followed by the current local time.

    For every board except ``socialevent`` the first title is printed as the
    pinned entry and the remaining titles are paired with their rank cells;
    for ``socialevent`` each title is printed with its first and last
    character stripped.

    Non-numeric or out-of-range menu input is rejected with a message instead
    of crashing (``int()`` used to run outside the ``try``) or silently
    selecting another board through a negative index.
    Nothing is returned; all results go to stdout.
    """
    categories = ('realtimehot', 'socialevent', 'entrank')
    url = 'https://s.weibo.com/top/summary'
    problem = "存在响应或输入问题!重新查询!"
    # Named differently from the module-level ``headers`` to avoid shadowing it.
    # NOTE(review): the cookie is a hard-coded session value and may expire.
    request_headers = {
        'cookie': 'SINAGLOBAL=690519784757.2731.1671192419517; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFB2MFg.53.mACIaAgd8wTi5JpVF020e05Neh5XSoMp; SUB=_2AkMUwi8HdcPxrAZZnPoTymngb49H-jynF0bxAn7uJhMyAxh87nwzqSVutBF-XMKjNdhFviACxIXacTNM_j5vca_y; _s_tentry=www.google.com; UOR=,,www.google.com; Apache=8260187671478.501.1675384443714; ULV=1675384443775:3:1:1:8260187671478.501.1675384443714:1671340035730',
        'referer': 'https://www.google.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }

    while True:
        print('1.热搜榜\t 2.要闻榜\t 3.文娱榜\t 4.退出微博热榜单查询')
        try:
            num = int(input("输入你要查询的榜单:"))
        except ValueError:
            print(problem)
            continue
        if num == 4:
            break
        if not 1 <= num <= len(categories):
            print(problem)
            continue

        cate = categories[num - 1]
        try:
            # A timeout keeps the menu from hanging forever on a dead connection.
            res = requests.get(url=url, headers=request_headers, params={'cate': cate}, timeout=10)
            e = etree.HTML(res.text)
            if cate != 'socialevent':
                ranks = e.xpath('//div[@id="pl_top_realtimehot"]//tbody/tr/td[1]/text()')
                # First entry is the pinned topic, the rest are the ranked titles.
                titles = e.xpath('//div[@id="pl_top_realtimehot"]//tbody/tr/td[2]/a/text()')
                print(f"置顶:------->{titles[0]}")
                for rank, title in zip(ranks, titles[1:]):
                    print(f"{rank}-------->{title}")
            else:
                titles = e.xpath('//div[@id="pl_top_realtimehot"]/table/tbody/tr/td[2]/a/text()')
                for title in titles:
                    # Drop the first and last character of each title, as the original output did.
                    print(f'o------>{title[1:-1]}')
            print(f'北京时间:{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except (requests.RequestException, IndexError, AttributeError, ValueError, etree.LxmlError):
            print(problem)

# b站类
class Bili:
    """Console helpers for bilibili's ranking page and "popular" web APIs.

    Every public method prints its results to stdout; ``get_bili`` and
    ``zong_he_rm`` can additionally save what they printed to an ``.xlsx``
    file in the current working directory.

    All requests are sent with the class-level ``headers`` and a timeout.
    Failed requests or unexpected responses are reported with a message
    instead of raising into the caller's menu loop.
    """

    # Re-wrap stdout so console output is encoded as gb18030.
    # NOTE(review): this runs once, when the class body is executed (i.e. at
    # import time), and affects the whole process — confirm this is intended.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # change the default stdout encoding

    # Request headers sent with every bilibili request made by this class.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'origin': 'https://www.bilibili.com',
        'referer': 'https://www.bilibili.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }

    # Seconds to wait for a response before giving up.
    _TIMEOUT = 10

    # (URL slug of the ranking page, label shown to the user), in menu order.
    _RANK_BOARDS = (
        ('all', '全站'), ('bangumi', '番剧'), ('guochan', '国产动画'), ('guochuang', '国创相关'),
        ('documentary', '纪录片'), ('douga', '动画'), ('music', '音乐'), ('dance', '舞蹈'),
        ('game', '游戏'), ('knowledge', '知识'), ('tech', '科技'), ('sports', '运动'),
        ('car', '汽车'), ('life', '生活'), ('food', '美食'), ('animal', '动物圈'),
        ('kichiku', '鬼畜'), ('fashion', '时尚'), ('ent', '娱乐'), ('cinephile', '影视'),
        ('movie', '电影'), ('tv', '电视剧'), ('variety', '综艺'), ('origin', '原创'),
        ('rookie', '新人'),
    )

    _MSG_RANK_ERROR = "输入格式有误或响应错误,重新输入"
    _MSG_RESPONSE_ERROR = "请求或响应出错,请稍后重试!"
    _MSG_ISSUE_ERROR = "响应超时或该期数不存在! 请重新输入!"

    # ------------------------------------------------------------ helpers

    def _fetch_list(self, url, params=None):
        """Return ``response.json()['data']['list']`` for *url*, or ``None`` on any failure."""
        try:
            res = requests.get(url, headers=self.headers, params=params, timeout=self._TIMEOUT)
            items = res.json()['data']['list']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network error, non-JSON body, or a payload without data/list.
            return None
        return items if isinstance(items, list) else None

    @staticmethod
    def _video_row(data):
        """Flatten one video entry of a "popular" API response into a row dict."""
        stat = data["stat"]
        return {
            "板块": data["tname"],
            "标题": data["title"],
            "up": data["owner"]["name"],
            "播放量": stat["view"],
            "评论数": stat["reply"],
            "投币数": stat["coin"],
            "点赞数": stat["like"],
        }

    def _show_videos(self, url, params, failure_message):
        """Fetch a video list, print one line per video and return the row dicts.

        Prints *failure_message* and returns ``None`` when the list cannot be
        fetched or an entry lacks one of the expected fields.
        """
        datas = self._fetch_list(url, params)
        if datas is None:
            print(failure_message)
            return None
        try:
            rows = [self._video_row(data) for data in datas]
        except (KeyError, TypeError):
            print(failure_message)
            return None
        for row in rows:
            print(
                f'{row["板块"]}-------标题:{row["标题"]}-----up:{row["up"]}------播放量:{row["播放量"]}-----评论数:{row["评论数"]}------投币数:{row["投币数"]}-------点赞数{row["点赞数"]}')
        print()
        return rows

    @staticmethod
    def _save_excel(rows, filename):
        """Write *rows* (a list of dicts) to *filename* as an Excel sheet; return success.

        ``DataFrame.to_excel`` is given the path directly: the ``encoding``
        argument and ``ExcelWriter.save()`` used previously no longer exist
        in pandas 2.x.
        """
        try:
            pd.DataFrame(rows).to_excel(filename, index=False)
        except (OSError, ImportError, ValueError) as err:
            # e.g. file open in Excel, missing Excel engine, or unsupported data.
            print(f'保存失败:{err}')
            return False
        return True

    @staticmethod
    def _dated_name(suffix):
        """Print the current timestamp and return ``YYYY_MM_DD`` + *suffix* + ``.xlsx``."""
        today = datetime.datetime.today()
        print(today)
        return f'{today:%Y_%m_%d}{suffix}.xlsx'

    # ------------------------------------------------------------ ranking page

    def get_bili(self):
        """Interactively print (and optionally save) a bilibili ranking board.

        Shows a numbered menu (1-25 select a board, 100 leaves the loop),
        scrapes ``https://www.bilibili.com/v/popular/rank/<board>`` and prints
        title, uploader, play count and comment count of each entry. After
        printing, the user may save the rows to ``<date><board>.xlsx``.
        """
        boards = self._RANK_BOARDS
        while True:
            print('1.全站 2.番剧 3.国产动画 4.国创相关 5.纪录片 6.动画 7.音乐 8.舞蹈 9.游戏 10.知识 11.科技 12.运动 13.汽车 14.生活 15.美食 '
                  '    16.动物圈 17.鬼畜 18.时尚 19.娱乐 20.影视 21.电影 22.电视剧 23.综艺 24.原创 25.新人  100.退出当前查询')
            try:
                num = int(input("你要查询的类型榜单:"))
            except ValueError:
                print(self._MSG_RANK_ERROR)
                continue
            if num == 100:
                break
            # Reject 0/negatives explicitly; they would otherwise index from the end.
            if not 1 <= num <= len(boards):
                print(self._MSG_RANK_ERROR)
                continue

            slug, label = boards[num - 1]
            print(f'正在查询{label}的榜单:')
            url = f'https://www.bilibili.com/v/popular/rank/{slug}'
            try:
                # headers must be passed by keyword: the second positional
                # argument of requests.get is ``params``.
                res = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
                e = etree.HTML(res.text)
                # Uploader / plays / comments arrive as one flat list of span texts.
                details = [text.strip().replace('\n', '')
                           for text in e.xpath('//div[@class="detail"]//span/text()')]
                titles = e.xpath('//div[@class="info"]/a/text()')
                groups = [details[i:i + 3] for i in range(0, len(details), 3)]  # three values per entry
                print(e.xpath('//div[@id="app"]//ul[@class="rank-tab"]/li/text()'))
                rows = [
                    {"标题": title, "up": group[0], "播放量": group[1], "评论": group[2]}
                    for group, title in zip(groups, titles)
                ]
            except (requests.RequestException, IndexError, AttributeError, ValueError, etree.LxmlError):
                print(self._MSG_RANK_ERROR)
                continue

            for row in rows:
                print(f'标题:{row["标题"]}------------up:{row["up"]}------------播放:{row["播放量"]}------------评论:{row["评论"]}')
            print(f'查询{label}榜单完毕!')

            save = input("是否保存到本地? 'y/n':")
            if save == 'y' and self._save_excel(rows, self._dated_name(label)):
                print(f'{label}已保存!')

    # ------------------------------------------------------------ popular APIs

    def zong_he_rm(self):
        """Print the first 50 entries of the "popular" feed and optionally save them.

        When the user answers ``y`` the rows are written to
        ``<date>b站综合热门.xlsx``.
        """
        url = 'https://api.bilibili.com/x/web-interface/popular'
        params = {
            'ps': '50',   # number of entries requested
            'pn': '1'
        }
        rows = self._show_videos(url, params, self._MSG_RESPONSE_ERROR)
        if rows is None:
            return
        save = input("是否保存到本地? 'y/n':")
        if save == 'y' and self._save_excel(rows, self._dated_name('b站综合热门')):
            print('b站综合热门已保存!')

    def weekly(self, num_page):
        """Print the videos of weekly-selection issue *num_page*.

        *num_page* is passed to the API as the ``number`` parameter; an
        unknown issue is reported with a message instead of raising.
        """
        url = 'https://api.bilibili.com/x/web-interface/popular/series/one'
        self._show_videos(url, {'number': num_page}, self._MSG_ISSUE_ERROR)

    def week2(self):
        """Print all weekly-selection issues and return their issue numbers.

        Returns:
            list: the ``number`` field of every issue, in response order;
            empty when the request fails.
        """
        url = 'https://api.bilibili.com/x/web-interface/popular/series/list'
        issues = self._fetch_list(url)
        if issues is None:
            print(self._MSG_RESPONSE_ERROR)
            return []
        num_lis = []
        for issue in issues:
            print(f'期数:{issue["number"]}----------{issue["subject"]}-----------{issue["name"]}')
            num_lis.append(issue["number"])
        return num_lis

    def r_z(self):
        """Print the first 100 entries of the "precious" (must-watch) list."""
        url = 'https://api.bilibili.com/x/web-interface/popular/precious'
        params = {
            'page_size': '100',
            'page': '1'
        }
        self._show_videos(url, params, self._MSG_RESPONSE_ERROR)

    def music_(self, num):
        """Print the site-wide music chart for issue *num*.

        *num* is passed to the API as ``list_id``. A failed request or an
        unknown issue is reported with a message instead of raising.
        """
        url = 'https://api.bilibili.com/x/copyright-music-publicity/toplist/music_list'
        datas = self._fetch_list(url, {'list_id': num})
        if datas is None:
            print(self._MSG_ISSUE_ERROR)
            return
        try:
            lines = [
                f'歌名:{data["music_title"]}-----歌手:{data["singer"]}-----热度:{data["heat"]}-----播放量:{data["creation_play"]}------up:{data["creation_nickname"]}------成就:{data["achievements"]}'
                for data in datas
            ]
        except (KeyError, TypeError):
            print(self._MSG_ISSUE_ERROR)
            return
        for line in lines:
            print(line)
        print()


def _bili_menu():
    """Run the bilibili sub-menu until the user chooses option 6."""
    bl = Bili()
    print("----进入查询b站界面----")
    while True:
        print("1.综合热门 2.每周必看 3.入站必刷 4.排行榜 5.全站音乐榜 6.退出当前查询")
        n5 = input("输入你要查询的板块:")
        if n5 == '1':
            bl.zong_he_rm()
        elif n5 == '2':
            # List all issues once, then let the user inspect issues until 'q'.
            qi_shu = bl.week2()
            while True:
                print(qi_shu)
                num = input("输入你要查看的期数(按q退出):")
                if num == 'q':
                    break
                bl.weekly(num)
        elif n5 == '3':
            bl.r_z()
        elif n5 == '4':
            bl.get_bili()
        elif n5 == '5':
            while True:
                num1 = input('输入查询期号(按q退出!):')
                if num1 == 'q':
                    break
                bl.music_(num=num1)
        elif n5 == '6':
            break


def main():
    """Show the top-level menu and dispatch to the selected query screen.

    Loops until the user enters ``q``, which terminates the program via
    ``sys.exit()`` (the ``exit()`` helper injected by ``site`` is meant for
    the interactive interpreter only and is not always available).
    """
    rule = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
    menu = (
        rule,
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ???Welcome to into???',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 1:查询手机号',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 2:查询天气',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 3:查询百度热搜',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 4:查询微博热搜',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 5:查询b站页面',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 6:ai问答区',
        '->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t q:退出查询',
        rule,
    )
    # Screens that only need a banner followed by a single call.
    screens = {
        '1': ("----进入查询手机号界面----", phone_number),
        '2': ("----进入查询天气界面----", get_weather),
        '3': ("----进入查询百度热搜界面----", bai_du_rei_sou),
        '4': ("----进入查询微博热搜界面----", weibo_search),
    }

    while True:
        for line in menu:
            print(line)

        order = input("输入查询的序号:")
        if order in screens:
            banner, handler = screens[order]
            print(banner)
            handler()
        elif order == '5':
            _bili_menu()
        elif order == '6':
            # Imported lazily: the AI Q&A code lives in a separate module.
            from re_Ai import input_get_info
            input_get_info()
        elif order == 'q':
            sys.exit()
        else:
            print("输入的格式有误!请重新输入")

# Start the interactive menu only when this file is run as a script.
if __name__ == '__main__':
    main()

标签:ai,text,excel,爬取,params,res,print,table,data
From: https://www.cnblogs.com/socoo-/p/17091119.html

相关文章