
Tutorial: Recording Weibo Keyword Data with Python



What it does:

Given a keyword, it records data about the matching Weibo users and posts.

[Screenshot: keyword search results]

Core code:

from config import g_none_word, g_weibo_host, g_weibo_headers, WeiboData
import requests
from bs4 import BeautifulSoup
import csv
import re
import json
import os
import dateutil.parser


def base62_decode(string):
    """
    Decode a base62 string (0-9, a-z, A-Z) into an integer.
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    string = str(string)
    num = 0
    idx = 0
    for char in string:
        power = (len(string) - (idx + 1))
        num += alphabet.index(char) * (len(alphabet) ** power)
        idx += 1

    return num
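
# For example, base62_decode('lOMb') == 5200075: the alphabet maps
# 'l' -> 21, 'O' -> 50, 'M' -> 48, 'b' -> 11, and
# 21*62**3 + 50*62**2 + 48*62 + 11 == 5200075.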


def reverse_cut_to_length(content, code_func, cut_num=4, fill_num=7):
    """
    Cut `content` into chunks of `cut_num` characters from the right,
    apply `code_func` to each chunk, zero-pad every chunk but the first
    to `fill_num` digits, and join the results. Used to turn a post's
    base62 mblogid (from its URL) into its numeric mid.
    """
    content = str(content)
    cut_list = [content[i - cut_num if i >= cut_num else 0:i] for i in range(len(content), 0, (-1 * cut_num))]
    cut_list.reverse()
    result = []
    for i, item in enumerate(cut_list):
        s = str(code_func(item))
        if i > 0 and len(s) < fill_num:
            s = (fill_num - len(s)) * '0' + s
        result.append(s)
    return ''.join(result)
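
# For example, reverse_cut_to_length('z0JH2lOMb', base62_decode) cuts the
# string into ['z', '0JH2', 'lOMb'], decodes the chunks to 35, 175648 and
# 5200075, left-pads every chunk after the first to 7 digits, and joins
# '35' + '0175648' + '5200075' into '3501756485200075'.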


def url_to_mid(url: str):
    """
    >>> url_to_mid('z0JH2lOMb')
    3501756485200075
    """
    result = reverse_cut_to_length(url, base62_decode)
    return int(result)
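
# The inverse direction (mid -> mblogid) is not part of the original post;
# here is a minimal sketch of it, assuming the same 4-character / 7-digit
# chunking that url_to_mid uses.
def base62_encode(num: int) -> str:
    """Encode an integer as a base62 string (inverse of base62_decode)."""
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if num == 0:
        return alphabet[0]
    digits = []
    while num > 0:
        num, rem = divmod(num, 62)
        digits.append(alphabet[rem])
    return ''.join(reversed(digits))


def mid_to_url(mid: int) -> str:
    """
    >>> mid_to_url(3501756485200075)
    'z0JH2lOMb'
    """
    s = str(mid)
    # Split the decimal string into 7-digit chunks from the right.
    chunks = [s[i - 7 if i >= 7 else 0:i] for i in range(len(s), 0, -7)]
    chunks.reverse()
    result = []
    for i, chunk in enumerate(chunks):
        enc = base62_encode(int(chunk))
        # Every chunk except the first is zero-padded back to 4 base62 chars.
        if i > 0:
            enc = enc.rjust(4, '0')
        result.append(enc)
    return ''.join(result)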


def parse_time(s):
    """
    Wed Oct 19 23:44:36 +0800 2022 => 2022-10-19 23:44:36
    """
    return dateutil.parser.parse(s).strftime('%Y-%m-%d %H:%M:%S')


def parse_user_info(data):
    """
    Parse a Weibo user payload into a flat dict.
    """
    # Basic fields
    user = {
        "_id": str(data['id']),
        "avatar_hd": data['avatar_hd'],
        "nick_name": data['screen_name'],
        "verified": data['verified'],
    }
    # Optional fields, copied only when present
    keys = ['description', 'followers_count', 'friends_count', 'statuses_count',
            'gender', 'location', 'mbrank', 'mbtype', 'credit_score']
    for key in keys:
        if key in data:
            user[key] = data[key]
    if 'created_at' in data:
        user['created_at'] = parse_time(data.get('created_at'))
    if user['verified']:
        user['verified_type'] = data['verified_type']
        if 'verified_reason' in data:
            user['verified_reason'] = data['verified_reason']
    return user
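
# For illustration, a hypothetical, heavily trimmed payload of the kind the
# Weibo ajax endpoints return, and what parse_user_info makes of it:
#   parse_user_info({"id": 1234567890, "avatar_hd": "https://example.com/a.jpg",
#                    "screen_name": "some_user", "verified": False,
#                    "followers_count": 42, "gender": "m",
#                    "created_at": "Wed Oct 19 23:44:36 +0800 2022"})
#   => {'_id': '1234567890', 'avatar_hd': 'https://example.com/a.jpg',
#       'nick_name': 'some_user', 'verified': False, 'followers_count': 42,
#       'gender': 'm', 'created_at': '2022-10-19 23:44:36'}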


class WeiboCrawler(object):
    """Crawler entry point.

    Args:
        search_config (dict): search parameters such as keyword,
            begin_time, end_time and page.
    """
    def __init__(self, search_config: dict):
        self.__search_config = search_config
        self.__search_result = False
        self.__key_word = search_config.get("keyword", g_none_word)

    def start_search(self, is_need_multithreading: bool = False):
        """Run the keyword search.

        Args:
            is_need_multithreading (bool, optional): whether to search with
                multiple threads (currently unused). Defaults to False.

        Returns:
            bool: whether the search succeeded
        """
        keyword = self.__search_config.get("keyword", g_none_word)
        if keyword == g_none_word:
            print(".......... no keyword provided, search failed ............")
            return False
        print(f"Search started, keyword: {keyword}")
        is_search_by_time = True
        begin_time = self.__search_config.get("begin_time", g_none_word)
        end_time = self.__search_config.get("end_time", g_none_word)
        page = self.__search_config.get("page", g_none_word)
        if begin_time == g_none_word or end_time == g_none_word:
            print("Begin time or end time is not set")
            is_search_by_time = False
        if is_search_by_time:
            time_scope = f"custom%3A{begin_time}%3A{end_time}"
            print(f"Built the search time range, parameter: {time_scope}")
            req_url = f"{g_weibo_host}q={keyword}&typeall=1&suball=1&timescope={time_scope}&Refer=g&page={page}"
            print(f"Built the search url: {req_url}")
            resp = requests.get(req_url, headers=g_weibo_headers)
            if resp.status_code != 200:
                print(f"..... bad response from {req_url} ......")
                return False
            try:
                resp.encoding = "utf-8"
                self.__result_text = resp.text  # keep the raw result
                self.__search_result = True
                return True
            except Exception:
                print("Failed to read the search result")
                return False
        else:
            req_url = f"https://s.weibo.com/weibo?q={keyword}&Refer=index"
            print(f"Built the search url: {req_url}")
            resp = requests.get(req_url, headers=g_weibo_headers)
            if resp.status_code != 200:
                print(f"..... bad response from {req_url} ......")
                return False
            try:
                resp.encoding = "utf-8"
                self.__result_text = resp.text  # keep the raw result
                self.__search_result = True
                return True
            except Exception:
                print("Failed to read the search result")
                return False
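
    # For reference, with hypothetical values the time-scoped branch above
    # builds a URL like the following (the hour-granularity YYYY-MM-DD-HH
    # timescope format is an assumption to verify against s.weibo.com):
    #   begin_time = "2022-10-01-0", end_time = "2022-10-02-23"
    #   time_scope = "custom%3A2022-10-01-0%3A2022-10-02-23"
    #   -> https://s.weibo.com/weibo?q=python&typeall=1&suball=1&timescope=custom%3A2022-10-01-0%3A2022-10-02-23&Refer=g&page=1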

    def parse_blog_info(self, data):
        """Parse a post payload into a flat dict."""
        tweet = {
            "_id": str(data['mid']),
            "mblogid": data['mblogid'],
            "created_at": parse_time(data['created_at']),  # publish time
            "geo": data['geo'],
            "ip_location": data.get('region_name', None),
            "reposts_count": data['reposts_count'],
            "comments_count": data['comments_count'],
            "attitudes_count": data['attitudes_count'],
            "source": data['source'],
            "content": data['text_raw'].replace('\u200b', ''),
            "pic_urls": ["https://wx1.sinaimg.cn/orj960/" + pic_id for pic_id in data.get('pic_ids', [])],
            "pic_num": data['pic_num'],
            'isLongText': False,
            "user": parse_user_info(data['user']),
        }
        if 'page_info' in data and data['page_info'].get('object_type', '') == 'video':
            tweet['video'] = data['page_info']['media_info']['mp4_720p_mp4']
        tweet['url'] = f"https://weibo.com/{tweet['user']['_id']}/{tweet['mblogid']}"  # post url
        if 'continue_tag' in data and data['isLongText']:
            tweet['isLongText'] = True
        return tweet


    def save_wb_data(self, file_name, wb_data: WeiboData):
        """Append one row to the CSV file, writing the header first for a new file."""
        data_dict = wb_data.__dict__
        is_first = not os.path.exists(file_name)
        # utf-8-sig so the Chinese text opens correctly in Excel
        with open(file_name, 'a+', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            if is_first:
                header = ["keyword", "post content", "post url", "likes",
                          "reposts", "comments", "image/video links",
                          "publish time", "author id", "author name",
                          "account type", "follower count", "author bio",
                          "IP location", "gender", "total posts"]
                writer.writerow(header)
            writer.writerow(list(data_dict.values()))


    def save_to_file(self, file_name: str, is_appended: bool = True):
        """Save the results to a file.

        Args:
            file_name (str): file name
            is_appended (bool, optional): whether to write in append mode
                (currently unused). Defaults to True.
        Returns:
            bool: whether the data was saved successfully
            str: a short status message
        """
        if not self.__search_result:
            return False, "No search results, nothing to save"
        result_text = self.__result_text
        # Pull the mblogid of every post out of the search page HTML
        tweet_ids = re.findall(r'\d+/(.*?)\?refer_flag=1001030103_\'\)">复制微博地址</a>', result_text)
        for tweet_id in tweet_ids:
            wb_data = WeiboData()  # one output row
            wb_data.keyword = self.__key_word
            # Fetch the full post from the ajax endpoint
            url = f"https://weibo.com/ajax/statuses/show?id={tweet_id}"
            resp_blog = requests.get(url, headers=g_weibo_headers)
            resp_blog.encoding = 'utf-8'
            data = json.loads(resp_blog.text)
            item_blog = self.parse_blog_info(data)
            wb_data.post_content = item_blog.get("content", g_none_word)
            wb_data.post_url = item_blog.get("url", g_none_word)
            wb_data.post_liked = item_blog.get("attitudes_count", "0")  # likes
            wb_data.post_transpond = item_blog.get("reposts_count", "0")  # reposts
            wb_data.post_comment = item_blog.get("comments_count", "0")  # comments
            wb_data.post_image_videos_link = str(item_blog.get("video", g_none_word)) + str(item_blog.get("pic_urls", g_none_word))
            wb_data.post_release_time = item_blog.get("created_at", g_none_word)
            wb_data.post_user_id = item_blog["user"]["_id"]
            wb_data.post_user_name = item_blog["user"]["nick_name"]
            # Fetch the author's profile for the remaining columns
            user_id = item_blog["user"]["_id"]
            user_url = f'https://weibo.com/ajax/profile/info?uid={user_id}'
            resp_user = requests.get(user_url, headers=g_weibo_headers)
            resp_user.encoding = "utf-8"
            data_user = json.loads(resp_user.text)
            item_user = parse_user_info(data_user["data"]["user"])
            url_user_info = f"https://weibo.com/ajax/profile/detail?uid={item_user['_id']}"
            resp_user_info = requests.get(url_user_info, headers=g_weibo_headers)
            data_user_info = json.loads(resp_user_info.text)['data']
            item_user['birthday'] = data_user_info.get('birthday', g_none_word)
            if 'created_at' not in item_user:
                item_user['created_at'] = data_user_info.get('created_at', g_none_word)
            item_user['desc_text'] = data_user_info.get('desc_text', g_none_word)
            item_user['ip_location'] = data_user_info.get('ip_location', g_none_word)
            item_user['sunshine_credit'] = data_user_info.get('sunshine_credit', {}).get('level', g_none_word)
            item_user['label_desc'] = [label['name'] for label in data_user_info.get('label_desc', [])]
            if 'company' in data_user_info:
                item_user['company'] = data_user_info['company']
            if 'education' in data_user_info:
                item_user['education'] = data_user_info['education']

            wb_data.post_account_type = item_user.get("verified", g_none_word)  # verified or not
            # 'followers_count' is the fan count; 'friends_count' is the following count
            wb_data.post_fans_num = item_user.get("followers_count", g_none_word)
            wb_data.post_author_brief = item_user.get("description", g_none_word)
            wb_data.post_ip_pos = item_user.get("ip_location", g_none_word)
            sex = item_user.get("gender", "m")  # 'm' = male
            wb_data.post_gender = "male" if sex == "m" else "female"
            wb_data.post_all_weibo_nums = item_user.get("statuses_count", g_none_word)
            self.save_wb_data(file_name, wb_data)
        return True, "Saved"
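
The listing imports g_none_word, g_weibo_host, g_weibo_headers, and WeiboData from a config module that the post never shows. Below is a minimal sketch of what that module might contain; every value is an assumption (the real module ships with the repo linked below), and the field order of WeiboData must match the CSV header row in save_wb_data, because rows are written straight from __dict__.values().

# config.py -- hypothetical sketch, not the module from the repo
from dataclasses import dataclass

g_none_word = "None"  # sentinel for missing values
g_weibo_host = "https://s.weibo.com/weibo?"  # assumed search host prefix
g_weibo_headers = {
    "user-agent": "Mozilla/5.0",
    "cookie": "<your weibo cookie here>",  # the ajax endpoints need a logged-in cookie
}


@dataclass
class WeiboData:
    # Field order must match the header row written by save_wb_data.
    keyword: str = g_none_word
    post_content: str = g_none_word
    post_url: str = g_none_word
    post_liked: str = g_none_word
    post_transpond: str = g_none_word
    post_comment: str = g_none_word
    post_image_videos_link: str = g_none_word
    post_release_time: str = g_none_word
    post_user_id: str = g_none_word
    post_user_name: str = g_none_word
    post_account_type: str = g_none_word
    post_fans_num: str = g_none_word
    post_author_brief: str = g_none_word
    post_ip_pos: str = g_none_word
    post_gender: str = g_none_word
    post_all_weibo_nums: str = g_none_word

With such a config in place, a minimal driver might look like this (the module name, keyword, and time values are illustrative):

# run.py -- usage sketch
from weibo_crawler import WeiboCrawler  # hypothetical module name for the listing above

if __name__ == "__main__":
    crawler = WeiboCrawler({
        "keyword": "python",
        "begin_time": "2022-10-01-0",   # assumed YYYY-MM-DD-HH timescope format
        "end_time": "2022-10-02-23",
        "page": 1,
    })
    if crawler.start_search():
        crawler.save_to_file("result.csv")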

Source code: https://github.com/huifeng-kooboo/weibo_keyword_crawl


From: https://blog.51cto.com/u_15906863/5978170
