首页 > 其他分享 >66拼dd评论采集

66拼dd评论采集

时间:2023-02-05 13:55:54浏览次数:31  
标签:goods name dd page 采集 66 print import id

# 需要更改的地方:cookie、accesstoken,以及与之对应的用户id(pdduid)。

# coding=gbk
# -*- coding:utf-8 -*-
# @Time: 2023/2/4
# @Author: 十架bgm
# @FileName: pd
import datetime
import json

import pandas as pd
import re
import time
import threading
import requests
import os
import sys
import io
# Bypass any configured system proxy for the Pinduoduo mobile host.
# NO_PROXY entries are host names matched by suffix, not full URLs, so the
# value must be the bare host for the bypass to take effect.
os.environ['NO_PROXY'] = 'mobile.pinduoduo.com'
# Emit console output as GB18030 (use 'utf8' instead on UTF-8 terminals).
# reconfigure() changes the encoding of the existing stream in place, which
# keeps its line buffering; wrapping sys.stdout.buffer in a second
# TextIOWrapper is only the fallback for streams without reconfigure().
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='gb18030')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Reviews
def _parse_spec_value(specs):
    """Return the spec value(s) from a review's ``specs`` field.

    The field is expected to be a JSON string such as
    ``[{"spec_key": "型号", "spec_value": "..."}]``. All ``spec_value``
    entries are joined with commas. If the field is not valid JSON the raw
    text is returned (or ``''`` when it is not a string at all).
    """
    try:
        entries = json.loads(specs)
    except (TypeError, ValueError):
        return specs.strip() if isinstance(specs, str) else ''
    if isinstance(entries, dict):
        entries = [entries]
    if not isinstance(entries, list):
        return str(entries)
    values = [
        str(entry.get('spec_value', '')).strip()
        for entry in entries
        if isinstance(entry, dict)
    ]
    return ','.join(value for value in values if value)


def pl(goods_id):
    """Collect the reviews of one product and optionally save them to Excel.

    Pages of 20 reviews are requested from the mobile reviews endpoint until
    an empty page is returned, a request fails, or the page limit is reached.
    Each review is printed as it is read. Afterwards the user is asked whether
    the collected rows should be written to ``<YYYY_MM_DD>商品id<goods_id>.xlsx``
    in the current directory.

    Args:
        goods_id: Product id used in the reviews URL.

    Returns:
        None. Results are printed and optionally written to disk.
    """
    url = f'https://mobile.pinduoduo.com/proxy/api/reviews/{goods_id}/list'  # e.g. 426046631761
    # NOTE: the cookie (including PDDAccessToken) and the matching pdduid are
    # tied to one logged-in session and must be replaced with your own values.
    # The headers never change between pages, so they are built once.
    headers = {
        'cookie': 'api_uid=CkmYfmPeZz+CAABuDOyCAg==; _nano_fp=XpE8npdalpCol0XyXo_NdxAuhKD77v5NckRC8MOK; webp=1; jrpl=wylxmv1fCBzsXnZ8RSiLfmTZDX037iLG; dilx=qHF8iHPRf6m5_hLzu5j3M; njrpl=wylxmv1fCBzsXnZ8RSiLfmTZDX037iLG; PDDAccessToken=KRWIP56MWHTQVXFSCZBSIY5AZ6U6SW5SZDC3KLVJ7KWEOIW4ZYNQ1116791; pdd_user_id=5735401831; pdd_user_uin=BIVTYNEACNDUJMXJRP6QBXZ7HQ_GEXDA; pdd_vds=gaSUgVJUpXKRzAgRFRgKJgSRHHJRJjJRSgVKHXHAzWXXFVpMMzWKkJWKZzJH',
        'referer': 'https://mobile.pinduoduo.com/goods_comments.html?goods_id=48187200265&_oc_trace_mark=199&_oc_adinfo=eyJzY2VuZV9pZCI6Mn0%3D&_oak_gallery=https%3A%2F%2Fimg.pddpic.com%2Fmms-material-img%2F2020-07-13%2Ff54514a5-03be-4e3c-8e7b-0b0011546dc7.png.a.jpeg&_oc_refer_ad=1&_x_query=%E5%B0%8F%E4%BD%A9solo%E9%A5%AE%E6%B0%B4%E6%9C%BA&refer_page_el_sn=99369&refer_rn=&refer_page_name=goods_detail&refer_page_id=10014_1675563362551_rjp7v0a1g5&refer_page_sn=10014&uin=BIVTYNEACNDUJMXJRP6QBXZ7HQ_GEXDA&mall_id=695078883',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }
    lis = []
    for page in range(1, 10000):  # review pages are 1-based
        params = {
            'pdduid': '5735401831',  # user id; must match the cookie
            'page': str(page),
            'size': '20',  # the endpoint serves at most 20 reviews per page
            'enable_video': '1',
            'enable_group_review': '1',
            'label_id': '0'
        }
        try:
            res = requests.get(url=url, headers=headers, params=params, timeout=10)
            datas = res.json().get('data')
        except (requests.RequestException, ValueError, AttributeError) as err:
            # Stop paging but keep the reviews collected so far.
            print(f'第{page}页请求失败:{err}')
            break
        if not isinstance(datas, list) or not datas:
            break
        print(len(datas))
        for data in datas:
            name = data.get("name", "")
            specs2 = _parse_spec_value(data.get("specs"))  # model / variant
            comment = data.get("comment", "")
            print(f'{name}-----{specs2}------{comment}')
            lis.append({
                "用户": name,
                "型号": specs2,
                "评论": comment
            })
        time.sleep(1)  # throttle between pages
    print(f'总计爬取{len(lis)}评论')

    if not lis:
        print("输入商品id有误!请重新输入!")
        return

    save = input("是否保存到本地? 'y/n':")
    if save != 'y':
        return
    today = datetime.date.today().strftime('%Y_%m_%d')
    filename = f'{today}商品id{goods_id}.xlsx'
    try:
        # Writing by path lets pandas open and close the workbook itself;
        # ExcelWriter.save() and the encoding argument no longer exist in
        # pandas 2.x.
        pd.DataFrame(lis).to_excel(filename, index=False)
    except (OSError, ImportError, ValueError) as err:
        print(f"保存失败:{err}")
        return
    print(f"保存成功,文件名为:{today}商品id{goods_id}")

# Order history
def order_list():
    """Fetch one page of the account's order history and print each order.

    Sends a JSON POST to the ``order_list_v4`` endpoint and prints every
    entry of the ``orders`` list in the response. Request failures,
    non-JSON bodies and responses without orders are reported with a
    message instead of a traceback.

    Returns:
        None.
    """
    url = 'https://mobile.pinduoduo.com/proxy/api/api/aristotle/order_list_v4'
    # NOTE: accesstoken, cookie and verifyauthtoken belong to one logged-in
    # session and must be replaced with your own values.
    header = {
        'accept': 'application/json, text/plain, */*',
        'accesstoken': 'RKIMYUG5HIT7P2EDFMQOMULKX4QNUY3CX5EC6JXRAOJGJCZ56ZBA1116791',
        'content-type': 'application/json;charset=UTF-8',
        'cookie': 'api_uid=CkmYfmPeZz+CAABuDOyCAg==; _nano_fp=XpE8npdalpCol0XyXo_NdxAuhKD77v5NckRC8MOK; webp=1; jrpl=wylxmv1fCBzsXnZ8RSiLfmTZDX037iLG; dilx=qHF8iHPRf6m5_hLzu5j3M; njrpl=wylxmv1fCBzsXnZ8RSiLfmTZDX037iLG; PDDAccessToken=RKIMYUG5HIT7P2EDFMQOMULKX4QNUY3CX5EC6JXRAOJGJCZ56ZBA1116791; pdd_user_id=5735401831; pdd_user_uin=BIVTYNEACNDUJMXJRP6QBXZ7HQ_GEXDA; rec_list_personal=rec_list_personal_w4ddbf; pdd_vds=gaLLNOnLQmPmbtPOOLmtoQitninonONoOOnaLEmnyItIbabmNLyOGomQoyNO; JSESSIONID=49619BB5F9A6D0F9BAE45FF96CB4F9EF',
        'origin': 'https://mobile.pinduoduo.com',
        'referer': 'https://mobile.pinduoduo.com/orders.html?type=0&comment_tab=1&combine_orders=1&main_orders=1&refer_page_name=personal&refer_page_id=10001_1675570924330_jic22o28i0&refer_page_sn=10001&order_index=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'verifyauthtoken': 'ymAWWO1WjmrFFdMV9YA2gw58de46e976b938cd7'
    }
    data = {
            "pdduid": "5735401831",   # user id; must match the cookie
            "type": "all",
            "page": 1,
            "origin_host_name": "mobile.pinduoduo.com",
            "page_from": 0,
            "size": 10,
            "offset": "MO-01-221020-252801259082407"
        }
    try:
        res = requests.post(url=url, headers=header, data=json.dumps(data), timeout=10)
        totals = res.json().get("orders")
    except (requests.RequestException, ValueError, AttributeError) as err:
        print(f"获取历史订单失败:{err}")
        return
    if not totals:
        # Typically means the hard-coded credentials are no longer valid.
        print("未获取到历史订单,请检查 cookie / accesstoken 是否过期!")
        return
    for i in totals:
        print(i)

def _extract_goods_id(link):
    """Return the numeric goods id contained in *link*, or ``None``.

    Accepts either a product URL carrying a ``goods_id=<digits>`` query
    parameter (wherever it sits in the query string) or a bare numeric id.
    """
    link = link.strip()
    if re.fullmatch(r'[0-9]+', link):
        return link
    # Capture digits only: matching up to a later "&_oak" parameter would
    # swallow every parameter in between, and fail when "_oak" is absent.
    match = re.search(r'goods_id=([0-9]+)', link)
    return match.group(1) if match else None


def main():
    """Run the interactive menu: 1 = collect reviews, 2 = list order history.

    In review mode the user is asked for product links repeatedly until
    ``q`` is entered; each link's goods id is extracted and passed to
    ``pl``. Any other command ends the program without further action.
    """
    print("1.评论  2.历史订单(暂未完善)")

    orde = input("输入查询的命令:")
    if orde == '1':
        while True:
            link = input("输入要查询的商品链接(按q退出):")
            if link == 'q':
                break
            goods_id = _extract_goods_id(link)
            if goods_id is None:
                print("请重新输入链接或去web页面复制商品链接!")
                continue
            print("商品id为:" + goods_id)
            pl(goods_id=goods_id)
    elif orde == '2':
        order_list()


if __name__ == '__main__':
    main()

标签:goods,name,dd,page,采集,66,print,import,id
From: https://www.cnblogs.com/socoo-/p/17093290.html

相关文章

  • CF1666K Kingdom Partition 题解
    神仙网络流题。Description传送门Solution考虑最小割,将每个点\(u\)拆成\(L_u,R_u\)两个点。对于每一条原图中的边\((u,v,w)\),连双向边\((L_u,R_v,w),(L_v,R_u,w)......
  • 问题:AttributeError: module 'lib' has no attribute 'OpenSSL_add_all_algorithms'
    分析:在使用支付宝沙箱时,报了这个错误,该问题是没有安装 openssl 包。解决:pip3 install pyOpenSSL,安装后再次运行。如果还是报错,请降低加密库版本:pip install cryptography==38.0.......
  • 【UVA10943】How do you add?
    比较简单的数学题。先设状态,以分解出的个数\(m\)划分阶段,以数\(n\)划分子问题。则显然的,有\(f_{i,j}=\sum\limits_{0\le w\le j}f_{i-1,j-w}\)。这个式子啥意思......
  • [Express] Add error handling middleware for express
    Expressapp:importcorsfrom'cors';importexpress,{Application}from'express';importroutesfrom'./routes';import*asmiddlewaresfrom'./middleware......
  • 领域驱动设计(DDD)——架构设计
    系统与子系统系统:泛指由一群有关联的个体组成,根据某种规则运作,能完成个别元件不能独立完成的工作能力的群体。子系统:也是由一群关联的个体组成的系统,多半是在更大的系统......
  • IndexedDB
    概述随着浏览器的功能不断增强,越来越多的网站开始考虑,将大量数据储存在客户端,这样可以减少从服务器获取数据,直接从本地获取数据现有的浏览器数据储存方案,都不适合储存大......
  • 深度学习基础——感受野、padding、stride、反向传播
    卷积神经网络基础补充——梯度下降卷积神经网络基础VGG网络详解及感受野的计算感受野、padding、stride感受野:特征图上一个像素点对应于原图中的区域有多大。举例:假设......
  • 隆重介绍:Buddy全新Docker CLI操作
    自2013年推出以来,Docker一直是软件开发人员的福音。一个隔离独立的测试环境,一个在整个团队中共享工具和依赖项的独立运行时且快速与易用,有保证的Kubernetes可扩展性——所有......
  • esphome-esp8266
    esp8266使用esphome接入hass对于生成配置文件的更改此处nodemcu泛指集成的开发板,一般十几块钱一块下方使用的是D1,对应的针脚是GPIO5esp8266:board:nodemcuv2......
  • 【第9篇】影响语音采集水平的因素
    2.2.2影响语音采集水平的因素语音采集的水平高低,严重影响后续语音识别结果的正确性,因此影响到整体语音交互的效果。一般影响语音采集水平的因素有如下几点:1.声源采样......