整体思路:
1、从热搜榜获取详情页的链接
2、排除掉广告以及置顶 广告热力值为空 置顶为第一个tr
3、点击加载更多评论页面会跳转
4、跳转页面后抓包获得评论所在真实地址
5、观察真实地址所需要的参数 从详情页获取 完成构造
6、对构造地址进行请求将count=20改为count=200 可以显示二百条
7、使用re对数据进行提取并保存
8、仅供学习使用
"""Scrape comments from Weibo trending-search topics into a CSV file.

Flow: fetch the trending list, skip ads/pinned entries, follow each
detail page, pull uid/mid out of its HTML to build the comments API URL
(count=200 instead of the default 20), extract region / screen name /
comment text with regexes, and append rows to the CSV.
For learning purposes only.
"""
import re
import csv
import requests
from lxml import etree

url = 'https://s.weibo.com/top/summary?sudaref=www.baidu.com'
headers = {
    # Replace with a real logged-in cookie; Weibo rejects anonymous requests.
    '你的': 'cookie',
    'referer': 'https://login.sina.com.cn/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}

# Compile the patterns once instead of rebuilding them for every entry.
uid_obj = re.compile(r'uid=(?P<uid>.*?)&')
mid_obj = re.compile(r'mid="(?P<mid>.*?)" >')
comment_obj = re.compile(
    r'"source":"(.*?)",.*?"screen_name":"(.*?)","profile_image_url":.*?"text_raw":"(.*?)"}',
    re.S,
)

resp = requests.get(url=url, headers=headers)
tree = etree.HTML(resp.text)
# The first <tr> is the pinned entry, not a real trending item.
tr_list = tree.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')[1:]

# Open the output file ONCE (the original re-opened it for every comment
# row); mode 'a' still appends across runs, as before.
with open('微博热搜评论.csv', 'a', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['标题', '评论名称', '所在地区', '评论内容'])

    for tr in tr_list:
        # Ads have no heat-value <span>.  xpath() returns a list, so test
        # its truthiness — the original `!= ''` comparison was always True
        # and never actually filtered anything.
        if not tr.xpath('./td[2]/span'):
            continue
        href = tr.xpath('./td[2]/a/@href')[0]
        # 'javascript:void(0)' links are not normal trending entries.
        if 'void' in href:
            continue
        title = tr.xpath('./td[2]/a/text()')[0]
        detail_url = 'https://s.weibo.com/' + href
        detail_html = requests.get(url=detail_url, headers=headers).text

        uid_match = uid_obj.search(detail_html)
        mid_match = mid_obj.search(detail_html)
        # Guard: a layout change or odd entry may lack uid/mid — skip it
        # instead of crashing with AttributeError on .group().
        if uid_match is None or mid_match is None:
            continue
        uid = uid_match.group('uid')
        mid = mid_match.group('mid')

        # count=200 makes the API return up to 200 comments.
        comments_url = (
            'https://weibo.com/ajax/statuses/buildComments'
            f'?is_reload=1&id={mid}&is_show_bulletin=2&is_mix=0&count=200&uid={uid}'
        )
        # Send the same headers here too — the comments API needs the
        # cookie (the original omitted them on this one request).
        comments_resp = requests.get(url=comments_url, headers=headers)

        for region, name, text in comment_obj.findall(comments_resp.text):
            csv_writer.writerow([title, name, region, text])
        print(title + '保存完成')
标签:200,uid,Python,writer,tr,mid,re,搜榜前,csv From: https://www.cnblogs.com/Gil-1117/p/16611776.html