爬取豆瓣top250影片资料(待修改)
使用BeautifulSoup方法进行操作,用CSS选择器截取html文本内容,对网页进行解析,示例如下。
# 标签: review, list, 爬虫, nth, lst, div, type, 选择器, CSS  From: https://www.cnblogs.com/smith-count/p/17474109.html
import requests
from bs4 import BeautifulSoup
#避免反复获取出现爬取失败
#头请求用于防止访问拒绝,亦可加cookies
def page_request(url, headers, timeout=10):
    """Download *url* and return its HTML as text, with ``<br>`` tags removed.

    headers: dict of request headers (User-Agent / cookies) used to avoid
        the site rejecting the request, as noted in the surrounding post.
    timeout: seconds to wait for the server.  BUG FIX: ``requests.get`` has
        no default timeout, so the original could hang forever on a stalled
        connection; the default keeps callers working unchanged.
    """
    htmltxt = requests.get(url, headers=headers, timeout=timeout).content.decode('utf-8')
    # <br> is an unpaired tag; per the post's note it is stripped by plain
    # string replacement so later get_text() output stays on one line.
    return htmltxt.replace('<br>', "")
def get_Info(htmltxt, ua):
    """Parse one Top250 list page.

    Returns a two-element list:
      [0] one "name|director_actor|score|review_count" string per film,
      [1] the review snippets scraped from each film's detail sub-page
          (via get_comments).
    ua: request headers forwarded to the sub-page fetches.
    """
    soup = BeautifulSoup(htmltxt, 'lxml')
    # Common CSS prefix down to each film's div.info node; the per-field
    # suffixes below are byte-identical to the original selectors.
    base = ('body>div:nth-of-type(3)>div:nth-of-type(1)>div:nth-of-type(1)'
            '>div.article>ol>li>div.item>div.info>')
    name = soup.select(base + 'div.hd>a')
    dir_and_actor = soup.select(base + 'div.bd>p:nth-of-type(1)')
    review_point = soup.select(base + 'div.bd>div.star>span:nth-of-type(2)')
    review_num = soup.select(base + 'div.bd>div.star>span:nth-of-type(4)')
    # BUG FIX: the original re-ran the name selector and then hard-coded
    # range(0, 10), silently dropping 15 of the 25 detail URLs on each page
    # (and raising IndexError on a page with fewer than 10 entries).
    # Collect the href of every matched anchor instead.
    get_review_web = [a.get('href') for a in name]
    # Clean each column's text.
    name = clear_info_name(name)
    dir_and_actor = clear_info_dir(dir_and_actor)
    review_num = clear_info_name(review_num)
    review_point = clear_info_name(review_point)
    # zip stops at the shortest column, so a partially-parsed page can no
    # longer raise IndexError the way the original index loop could.
    list_result = [
        nm + "|" + da + "|" + rp + "|" + rn
        for nm, da, rp, rn in zip(name, dir_and_actor, review_point, review_num)
    ]
    return [list_result, get_comments(get_review_web, ua)]
def clear_info_name(lst):
    """Clean a list of bs4 tags: keep each tag's text up to the first '/',
    with all whitespace removed.

    Named for the film-title column, but also reused by the caller for the
    score and review-count columns (whose text contains no '/').
    Returns a list of cleaned strings.
    """
    cleaned = []  # renamed: the original shadowed the builtin ``list``
    for tag in lst:
        text = tag.get_text().split('/')[0]
        # "".join(text.split()) deletes every whitespace character,
        # which also makes the original trailing .strip() redundant.
        cleaned.append("".join(text.split()))
    return cleaned
def clear_info_dir(lst):
    """Clean the director/actor tags: return each tag's text with every '/'
    separator and all whitespace deleted (one compact string per tag).
    """
    # FIXES vs. original: shadowed the builtin ``list``; reused loop index
    # ``i`` for both the outer and inner loop; built the string with a
    # quadratic ``+=`` loop.  Splitting on '/' and concatenating the pieces
    # is exactly deleting the '/' characters, and "".join(x.split()) then
    # deletes all whitespace — same output, one pass.
    cleaned = []
    for tag in lst:
        cleaned.append("".join(tag.get_text().replace('/', '').split()))
    return cleaned
def get_comments(lst, ua):
    """Visit each film detail URL in *lst* (its "reviews" sub-page) and
    collect the first line of each matched review node.

    ua: request headers forwarded to page_request.
    Returns a flat list of review snippet strings.  (Per the post's note,
    the snippets are truncated — still marked 待修改.)
    """
    output = []
    # BUG FIX: the original used ``i`` as the index of BOTH the outer and
    # the inner loop, shadowing the outer counter — harmless here only by
    # accident of Python's for-loop semantics; distinct names make the
    # intent explicit.
    for link in lst:
        htmltxt = page_request(link + "reviews", ua)
        soup = BeautifulSoup(htmltxt, 'lxml')
        result = soup.select('body>div:nth-of-type(3)>div:nth-of-type(1)>div:nth-of-type(1)>div:nth-of-type(1)>div:nth-of-type(1)>div>div:nth-of-type(1)>div:nth-of-type(1)>div:nth-of-type(1)>div:nth-of-type(1)')
        for node in result:
            record = node.get_text().strip().replace('\n', '|')
            # keep only the text before the first original newline
            output.append(record.split('|')[0])
    return output
#写入到txt
def save_part(lst, path='D:\\info.txt'):
    """Write one film-info record per line to *path* (UTF-8).

    path: output file; the default preserves the original hard-coded
        location, so existing callers are unaffected.
    The ``with`` block guarantees the file is closed even if a write
    fails (the original leaked the handle on error).
    """
    with open(path, 'w', encoding='utf-8') as fp:
        for record in lst:
            fp.write(record + '\n')
def save_review(lst, path='D:\\reviews.txt'):
    """Write one review snippet per line to *path* (UTF-8).

    path: output file; the default preserves the original hard-coded
        location, so existing callers are unaffected.
    Uses ``with`` so the handle is closed even on a write failure
    (the original leaked it on error).
    """
    with open(path, 'w', encoding='utf-8') as fp:
        for review in lst:
            fp.write(review + '\n')
if __name__ == '__main__':
    # Request headers — the post notes none are supplied; fill in a
    # User-Agent / Cookie here to avoid being rejected by the site.
    ua = {}
    list_result = []
    comments = []
    # BUG FIX: the original read and incremented ``time1`` without ever
    # initializing it, so the very first iteration raised NameError.
    start = 0
    # BUG FIX: 250 films / 25 per page = 10 pages; the original
    # range(0, 11) requested an 11th, empty page (start=250).
    # (Also removed an unused ``list = []`` that shadowed the builtin.)
    for _ in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(start) + '&filter='
        page = get_Info(page_request(url, ua), ua)
        list_result += page[0]
        comments += page[1]
        start += 25
    save_review(comments)
    save_part(list_result)
使用python3.6,更高版本会存在方法警告。
程序未给出请求标头。
使用CSS选择器获取的内容只做了初步清洗,建议再次修改完善。
通过获取子网页url再次访问,获取评论,评论未截全(待修改)。
关于<br>非成对标签,通过查询以字符替换的方式解决(即将<br>,替换为“”)。