目录
reference: 爬取携程景点评论数据
本博客记录一个爬取携程景点评论并制作词云的例子,并且可以很轻易地拓展到多个景点。
爬取景点评论
准备工作
确定需要爬取的景点,得到其网址和对应ID。
# Endpoint of Ctrip's mobile REST API that returns the collapsed comment list for a POI.
postUrl="https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
# Each entry is [poiId, attraction name]; append more pairs to scrape additional attractions.
urls = [['93527','思溪延村']]
获取HTML页面
这个JSON字典里的参数并不总是一样,一般要修改commentTagId;可以打开浏览器开发者工具的 Network 面板,查看实际请求中携带的参数值。
# Pick the first attraction: id[0] is the poiId, id[1] the display name.
# NOTE(review): `id` shadows the builtin; kept because later code relies on this name.
id = urls[0]
# Probe payload: requests page 1 only, so we can read totalCount before crawling.
# commentTagId is not always the same per attraction — confirm it against real traffic.
data_pre = {
    "arg": {
        "channelType": 2,
        "collapseType": 0,
        "commentTagId": -19,
        "pageIndex": 1,      # first page only; the crawl loop iterates the rest
        "pageSize": 10,
        "poiId": id[0],
        "sourceType": 1,
        "sortType": 3,
        "starType": 0
    },
    "head": {
        "cid": "09031073210610739886",
        "ctok": "",
        "cver": "1.0",
        "lang": "01",
        "sid": "8888",
        "syscode": "09",
        "auth": "",
        "xsid": "",
        "extension": []
    }
}
# Use `json=` so requests serializes the body AND sets the JSON Content-Type header
# (the original's data=json.dumps(...) sent no content type), and add a timeout so
# a stalled connection cannot hang the script forever.
html = requests.post(postUrl, json=data_pre, timeout=30).text
html = json.loads(html)
解析处理
# Determine the total page count: totalCount comments at 10 per page, rounded up.
total_page = int(html['result']['totalCount']) / 10
total_page = int(math.ceil(total_page))
print("总页数:", total_page, "爬取中")
# Cap the crawl at 30 pages. The original unconditionally overwrote the computed
# count with 30, which would request non-existent pages for short comment lists.
total_page = min(total_page, 30)
# Output file named after the attraction, e.g. "思溪延村.txt".
path = str(id[1]) + '.txt'
index = 0  # running sequence number across all written comments
with open(path, 'w', newline='', encoding='utf-8') as f:
    for page in range(1, total_page+1):
        # Same payload shape as the probe request, walking pageIndex through every page.
        data = {
            "arg": {
                "channelType": 2,
                "collapseType": 0,
                "commentTagId": -19,
                "pageIndex": page,
                "pageSize": 10,
                "poiId": id[0],
                "sourceType": 1,
                "sortType": 3,
                "starType": 0
            },
            "head": {
                "cid": "09031073210610739886",
                "ctok": "",
                "cver": "1.0",
                "lang": "01",
                "sid": "8888",
                "syscode": "09",
                "auth": "",
                "xsid": "",
                "extension": []
            }
        }
        # json= sets the proper Content-Type; timeout prevents an indefinite hang.
        html = requests.post(postUrl, json=data, timeout=30).text
        html = json.loads(html)
        # Each page carries at most 10 comments.
        for j in range(0, 10):
            try:
                result = html['result']['items'][j]['content']
                f.write('{} {}\n'.format(index, result))
                index += 1
            # Narrowed from a bare `except:` — only the expected failure modes:
            # the last page may hold fewer than 10 items (IndexError), or the
            # response may lack the expected keys/structure (KeyError/TypeError).
            except (KeyError, IndexError, TypeError):
                print("Error raised! Please look for page {} review {}.".format(page, j))
wordcloud(制作词云)
# Build a word cloud of the scraped comments, masked to the shape in circle.jpg.
mask = imageio.imread('circle.jpg')
# Read the whole comment file. `with` guarantees the handle is closed even if
# read() raises (the original open/read/close leaked the handle on error).
with open('思溪延村.txt', 'r', encoding='utf-8') as f:
    t = f.read()
# Tokenize with jieba, then blank out single-character tokens and common filler
# words so they do not dominate the cloud (same replacement rules as before:
# single chars become ' ', listed filler words become '').
ls = jieba.lcut(t)
ls = [' ' if len(w) == 1 else ('' if w in ['有点', '一些', '几个', '这个', '一个'] else w)
      for w in ls]
txt = ' '.join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", mask=mask,
                        width=1000, height=700, background_color='white', max_words=60)
w.generate(txt)
w.to_file('sixiyancun.png')
标签:旅游景点,Python,爬虫,爬取,html,ls,total,data,page
From: https://www.cnblogs.com/coco02/p/16901855.html