要求Python 编写程序爬取京东手机的评论数据,生成Json形式的数据文件。
import requests import json import csv from lxml import etree from bs4 import BeautifulSoup import time ''' #如果ip被封,可以使用这个进行ip设置 proxy ='114.239.148.103' proxies={ 'http':'http://'+proxy, 'https':'https://'+proxy, } ''' comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback' # 获取评论 def get_comment(productid, name): headers = { # 'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF', "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36", "Referer": "https://item.jd.com/%s.html" % (productid) } for i in range(100): # 此处设置爬取几页的评论 page = i params = { "productId": 100031406046, # 商品id 'score': 0, # 如果想要爬取全部评论设置为0,好评为3,中评为2,差评为1,晒图评价为4,追评为5 'sortType': 5, 'page': page, 'pageSize': 10, } comment_resp = requests.get(url=comment_url, params=params, headers=headers) comment_str = comment_resp.text if comment_str == '': print("获取内容为空") comment_dict = json.loads(comment_str) comments = comment_dict['comments'] load(comments, name, productid) # 数据存储 def load(comments, name, productid): for comment in comments: nickname = comment['nickname'] content = comment['content'].replace(',',',').replace('\n', '').replace('\r', '') g_uid = comment['guid'] creationTime = comment['creationTime'] is_Top = comment['isTop'] plus = comment['plusAvailable'] referenceTime = comment['referenceTime'] score = comment['score'] days = comment['days'] is_Mobile = comment['userClient'] if(is_Top == False): is_Top = "非置顶" else: is_Top = "置顶" if(is_Mobile == 2): is_Mobile = "IOS客户端" elif (is_Mobile == 0): is_Mobile = "PC端" elif(is_Mobile == 4): is_Mobile = "Android客户端" if(plus == 201): plus = "PLUS会员" else: plus = "非会员" test = g_uid \ + ',' + nickname \ + ',' + content \ + ',' + str(score) \ +',' + creationTime \ + ',' + str(is_Top) \ + ',' + plus \ + ',' + referenceTime \ + ',' + str(days) \ + ',' + str(is_Mobile) # print(infor) with open('D:/新建文件夹' + '%s' % (name) + '.csv', 'a', newline='')as csv_file: text = [] text.append(test) writer = csv.writer(csv_file) writer.writerow(text) # 获取搜索商品名界面的每个商品的序号 def get_number(name): headers = { 'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF', "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36", } response = requests.get("https://search.jd.com/Search?keyword=%s&enc=utf-8" % (name), headers=headers) html = BeautifulSoup(response.text, 'lxml') list = html.find_all("li", class_='gl-item') numbers = [] for number in list: numbers.append(int(number.get("data-sku"))) return numbers def main(): get_comment(100031406046, "差评-小米12Pro") #time.sleep(0.5) print("爬取完毕") main()
需求1:分析用户使用移动端购买还是PC端购买,及移动端和PC端的用户比例,生成ismobilehive表,存储统计结果;
建表,ismobilehive
insert into table ismobilehive select 'Android',count(*) from comment where is_mobile = 'Android客户端'; insert into table ismobilehive select 'IOS',count(*) from comment where is_mobile = 'IOS客户端'; insert into table ismobilehive select 'PC',count(*) from comment where is_mobile = 'PC端';
需求2:分析用户评论周期(收到货后,一般多久进行评论),生成dayssql表,存储统计结果;
create table dayssql( day int, num string ) row format delimited fields terminated by ','; ************************************************************************************************************** insert into table dayssql select '0',count(*) from comment01 where days = '0'; insert into table dayssql select '1',count(*) from comment01 where days = '1'; insert into table dayssql select '2',count(*) from comment01 where days = '2'; insert into table dayssql select '3',count(*) from comment01 where days = '3'; insert into table dayssql select '4',count(*) from comment01 where days = '4'; insert into table dayssql select '5',count(*) from comment01 where days = '5'; insert into table dayssql select '6',count(*) from comment01 where days = '6'; insert into table dayssql select '7',count(*) from comment01 where days = '7'; insert into table dayssql select '8',count(*) from comment01 where days = '8'; insert into table dayssql select '9',count(*) from comment01 where days = '9'; insert into table dayssql select '10',count(*) from comment01 where days = '10'; insert into table dayssql select '11',count(*) from comment01 where days = '11'; insert into table dayssql select '12',count(*) from comment01 where days = '12'; insert into table dayssql select '13',count(*) from comment01 where days = '13'; insert into table dayssql select '14',count(*) from comment01 where days = '14'; insert into table dayssql select '大于14天小于21天',count(*) from comment01 where days > '14' and days <= '21'; insert into table dayssql select '大于21天小于30天',count(*) from comment01 where days > '21' and days <= '30'; insert into table dayssql select '大于30天',count(*) from comment01 where days > '30'; **************************************************************************************************************
需求3:分析会员级别(判断购买此商品的用户级别),生成userlevelname_out表,存储统计结果;
create table userlevelname_out( level string, num int ) row format delimited fields terminated by ','; insert into table userlevelname_out select 'PLUS会员',count(*) from comment01 where plus = 'PLUS会员'; insert into table userlevelname_out select '非会员',count(*) from comment01 where plus = '非会员';
需求4:分析每天评论量,生成creationtime_out表,存储统计结果;
create table creationtime_out( day string, num int ) row format delimited fields terminated by ','; insert into table creationtime_out select '2021年12月',count(*) from comment01 where creationtime between '2021-12-01 00:00:00' and '2021-12-31 24:00:00'; insert into table creationtime_out select '2022年01月',count(*) from comment01 where creationtime between '2022-01-01 00:00:00' and '2022-01-31 24:00:00'; insert into table creationtime_out select '2022年02月',count(*) from comment01 where creationtime between '2022-02-01 00:00:00' and '2022-02-28 24:00:00'; insert into table creationtime_out select '2022年03月',count(*) from comment01 where creationtime between '2022-03-01 00:00:00' and '2022-03-31 24:00:00'; select * from creationtime_out;
需求5:日期格式标准化需求5:日期格式标准化
功能为:去掉评论时间的时分秒,只保留年月日
create table comment02( guid string, nickname string, content string, score int, creationTime string, is_Top string, plus string, referenceTime string, days int, is_Mobile string ) row format delimited fields terminated by ','; insert into table comment02 select guid,nickname,content,score,substr(creationTime,0,10),is_Top,plus,substr(referenceTime,0,10),days,is_Mobile from comment;
4、导出用到的是hive -e可以参考数据清洗——日志 - O_O似曾相识 - 博客园 (cnblogs.com)
5、数据显示
标签:comment,insert,开学,into,测试,table,where,select From: https://www.cnblogs.com/jyt604743080/p/17128448.html