首页 > 其他分享 >开学测试

开学测试

时间:2023-02-16 21:57:31浏览次数:53  
标签:comment insert 开学 into 测试 table where select

要求Python 编写程序爬取京东手机的评论数据,生成Json形式的数据文件。

import requests
import json
import csv
from lxml import etree
from bs4 import BeautifulSoup
import time

'''
#如果ip被封,可以使用这个进行ip设置
proxy ='114.239.148.103'
proxies={
    'http':'http://'+proxy,
    'https':'https://'+proxy,
    }
'''
comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback'


# 获取评论
def get_comment(productid, name):
    headers = {
        # 'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        "Referer": "https://item.jd.com/%s.html" % (productid)
    }
    for i in range(100):  # 此处设置爬取几页的评论
        page = i
        params = {
            "productId": 100031406046,  # 商品id
            'score': 0,  # 如果想要爬取全部评论设置为0,好评为3,中评为2,差评为1,晒图评价为4,追评为5
            'sortType': 5,
            'page': page,
            'pageSize': 10,
        }
        comment_resp = requests.get(url=comment_url, params=params, headers=headers)
        comment_str = comment_resp.text
        if comment_str == '':
            print("获取内容为空")
        comment_dict = json.loads(comment_str)
        comments = comment_dict['comments']
        load(comments, name, productid)


# 数据存储
def load(comments, name, productid):
    for comment in comments:
        nickname = comment['nickname']
        content = comment['content'].replace(',',',').replace('\n', '').replace('\r', '')
        g_uid = comment['guid']
        creationTime = comment['creationTime']
        is_Top = comment['isTop']
        plus = comment['plusAvailable']
        referenceTime = comment['referenceTime']
        score = comment['score']
        days = comment['days']
        is_Mobile = comment['userClient']

        if(is_Top == False):
            is_Top = "非置顶"
        else:
            is_Top = "置顶"

        if(is_Mobile == 2):
            is_Mobile = "IOS客户端"
        elif (is_Mobile == 0):
            is_Mobile = "PC端"
        elif(is_Mobile == 4):
            is_Mobile = "Android客户端"

        if(plus == 201):
            plus = "PLUS会员"
        else:
            plus = "非会员"


        test = g_uid \
               + ',' + nickname \
               + ',' + content \
               + ',' + str(score) \
               +',' + creationTime \
               + ',' + str(is_Top) \
               + ',' + plus \
               + ',' + referenceTime \
               + ',' + str(days) \
               + ',' + str(is_Mobile)

        # print(infor)
        with open('D:/新建文件夹' + '%s' % (name) + '.csv', 'a',
                  newline='')as csv_file:
            text = []
            text.append(test)
            writer = csv.writer(csv_file)
            writer.writerow(text)


# 获取搜索商品名界面的每个商品的序号
def get_number(name):
    headers = {
         'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    response = requests.get("https://search.jd.com/Search?keyword=%s&enc=utf-8" % (name), headers=headers)
    html = BeautifulSoup(response.text, 'lxml')
    list = html.find_all("li", class_='gl-item')
    numbers = []
    for number in list:
        numbers.append(int(number.get("data-sku")))
    return numbers


def main():
    get_comment(100031406046, "差评-小米12Pro")
    #time.sleep(0.5)
    print("爬取完毕")


main()

需求1:分析用户使用移动端购买还是PC端购买,及移动端和PC端的用户比例,生成ismobilehive表,存储统计结果;

建表,ismobilehive

 
insert into table ismobilehive select 'Android',count(*) from comment where is_mobile = 'Android客户端';
insert into table ismobilehive select 'IOS',count(*) from comment where is_mobile = 'IOS客户端';
insert into table ismobilehive select 'PC',count(*) from comment where is_mobile = 'PC端';

需求2:分析用户评论周期(收到货后,一般多久进行评论),生成dayssql表,存储统计结果;

   
create table dayssql(
day int,
num string
)
row format delimited fields terminated by ',';

**************************************************************************************************************
insert into table dayssql select '0',count(*) from comment01 where days = '0';
insert into table dayssql select '1',count(*) from comment01 where days = '1';
insert into table dayssql select '2',count(*) from comment01 where days = '2';
insert into table dayssql select '3',count(*) from comment01 where days = '3';
insert into table dayssql select '4',count(*) from comment01 where days = '4';
insert into table dayssql select '5',count(*) from comment01 where days = '5';
insert into table dayssql select '6',count(*) from comment01 where days = '6';
insert into table dayssql select '7',count(*) from comment01 where days = '7';
insert into table dayssql select '8',count(*) from comment01 where days = '8';
insert into table dayssql select '9',count(*) from comment01 where days = '9';
insert into table dayssql select '10',count(*) from comment01 where days = '10';
insert into table dayssql select '11',count(*) from comment01 where days = '11';
insert into table dayssql select '12',count(*) from comment01 where days = '12';
insert into table dayssql select '13',count(*) from comment01 where days = '13';
insert into table dayssql select '14',count(*) from comment01 where days = '14';
insert into table dayssql select '大于14天小于21天',count(*) from comment01 where days > '14' and days <= '21';
insert into table dayssql select '大于21天小于30天',count(*) from comment01 where days > '21' and days <= '30';
insert into table dayssql select '大于30天',count(*) from comment01 where days > '30';
**************************************************************************************************************
 

需求3:分析会员级别(判断购买此商品的用户级别),生成userlevelname_out表,存储统计结果;

   
create table userlevelname_out(
level string,
num int
)
row format delimited fields terminated by ',';

insert into table userlevelname_out select 'PLUS会员',count(*) from comment01 where plus = 'PLUS会员';
insert into table userlevelname_out select '非会员',count(*) from comment01 where plus = '非会员';
 

需求4:分析每天评论量,生成creationtime_out表,存储统计结果;

   
create table creationtime_out(
day string,
num int
)
row format delimited fields terminated by ',';

insert into table creationtime_out select '2021年12月',count(*) from comment01 where creationtime between '2021-12-01 00:00:00' and '2021-12-31 24:00:00';
insert into table creationtime_out select '2022年01月',count(*) from comment01 where creationtime between '2022-01-01 00:00:00' and '2022-01-31 24:00:00';
insert into table creationtime_out select '2022年02月',count(*) from comment01 where creationtime between '2022-02-01 00:00:00' and '2022-02-28 24:00:00';
insert into table creationtime_out select '2022年03月',count(*) from comment01 where creationtime between '2022-03-01 00:00:00' and '2022-03-31 24:00:00';

select * from creationtime_out;
 

需求5:日期格式标准化需求5:日期格式标准化

功能为:去掉评论时间的时分秒,只保留年月日

   
create table comment02(
guid string,
nickname string,
content string,
score int,
creationTime string,
is_Top string,
plus string,
referenceTime string,
days int,
is_Mobile string
)
row format delimited fields terminated by ',';

insert into table comment02 select guid,nickname,content,score,substr(creationTime,0,10),is_Top,plus,substr(referenceTime,0,10),days,is_Mobile from comment;
 

4、导出用到的是hive -e可以参考数据清洗——日志 - O_O似曾相识 - 博客园 (cnblogs.com)

5、数据显示

 

 

 

 

 

 

 

标签:comment,insert,开学,into,测试,table,where,select
From: https://www.cnblogs.com/jyt604743080/p/17128448.html

相关文章

  • python接口自动化13-API流量回放/锲约测试/自动化测试
    PPL-Tester简介http工具集,通过代理获取到API的请求与响应信息,将这些请求信息进行流量回放/锲约测试或快速生成用例,亦可通过人工进行修改参数化提取、变量引用、断言......
  • 与ChatGPT关于测试问题的对话
    1测试人员创造什么价值?测试人员在软件开发生命周期中扮演着非常重要的角色,他们的主要职责是确保软件在发布前达到高质量标准。以下是测试人员为软件开发和业务提供的价值......
  • Burp Suite Professional 2023.2 (macOS, Linux, Windows) - Web 应用安全、测试和扫
    BurpSuiteProfessional,Test,find,andexploitvulnerabilities.请访问原文链接:https://sysin.org/blog/burp-suite-pro-2023/,查看最新版。原创作品,转载请保留出处。......
  • 如何利用iphone自带的弱网络进行弱网或接口timeout测试
    前提条件:1.设置中调出:开发者选项。(方法一:找开发帮忙真机链接编译器xcode,重启手机后开发者选项消失.开发者选项在设置的一级子目录下;方法二:在设置隐私与安全性中,打开开发......
  • 课堂测试——架构
     (1)什么是架构?   人根据某个需要解决的问题,识别分析问题,并将按照最优解决方式对问题进行切分,让不同角色分工切分好的模块,最后整合各个达到最优解的模块形成一个整体,以......
  • (二)接口测试基础认知
    测试用例设计思路:从输入参数进行考虑设计:1)优先级-针对所有接口1、暴露给其他系统、第三方调用接口2、系统内部调用的核心功能接口3、系统内部调用的非核心功能接口2......
  • jmeter的阶梯式加压性能测试 jp@gc - Stepping Thread Group (deprecated)
    当测试需求要求是阶梯型的压力测试场景时,使用该线程组。比如测试场景是  从100并发开始,每60s加压50并发,直至达到目的并发数(中途发现问题随时停掉),之后保持每60s停止50......
  • 软件测试新人如何提升自己?
    首先要熟悉部门业务,了解目前一共有哪些项目,你可能会投入哪个项目,这样就可以有优先级的看需求。看需求一定要仔细,如果看不透彻,很容易漏掉测试点,或者发现不了需求存在的......
  • 如何提升软件测试的价值?
    日常我们测试的项目都是时间紧急,导致需求或设计都可能存在一定的问题;或是bug或是未考虑周全,或是描述不清。开发写代码过程也会出现遗漏、错误或到了时间节点未进行联......
  • 接口测试经验
    问题:1)为什么要做接口测试2)接口测试需要用到的流程和工具3)接口测试点编写的思路4)接口出现问题如何定位5)接口疑问  答案:一、为什么要做接口测试 a)提升测试工作......