说明
Python 3.6.1(32 位)
获取豆瓣电影推荐页的电影详情,参考网址:https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0
使用第三方库 BeautifulSoup4、xlwt、lxml。
代码
import os
import json
import shutil
from urllib.parse import urlencode
from urllib.request import urlopen, Request
import xlwt
from bs4 import BeautifulSoup
# Module-level accumulator shared by the functions below: getDetail() appends
# one 8-item row per movie page ([url, name, director, release date, score,
# number of raters, image url, summary]); saveToFile() and saveImg() read it.
datalist=[]
def getDetail(purl):
    """Scrape one Douban movie detail page and append its row to ``datalist``.

    The appended row has the layout ``[url, title, director, release date,
    score, number of raters, poster URL, summary]``.

    Scraping is best-effort: a field that cannot be fetched or parsed is
    stored as an empty string, so one bad page neither aborts the crawl nor
    raises ``UnboundLocalError`` when the row is assembled.

    Args:
        purl: Absolute URL of the movie detail page.
    """
    print("当前爬取页面:{0}".format(purl))
    # Pre-fill every field so the row is always complete, even when the
    # request fails or a selector matches nothing half-way through parsing.
    name = director = date = score = votes = img = desp = ''
    try:
        request = Request(purl, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        })
        # The context manager closes the HTTP connection even if read() fails.
        with urlopen(request) as resp:
            html = resp.read().decode('utf-8')
        rb = BeautifulSoup(html, 'lxml')
        name = rb.select_one('#content > h1 > span').get_text()
        director = rb.select_one('#info span.attrs').get_text()
        score = rb.select_one('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong').get_text()
        votes = rb.select_one('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span').get_text()
        date = rb.select_one('#info span[property="v:initialReleaseDate"]').get_text()
        img = rb.select_one('#mainpic > a > img').get('src')
        # Collapse the summary onto one line by dropping newlines and spaces.
        desp = rb.select_one('#link-report span[property="v:summary"]').get_text().strip().replace('\n', '').replace(' ', '')
    except Exception as err:
        # select_one() returns None for a missing element (AttributeError on
        # .get_text()), and the network layer raises its own errors; report
        # the problem and keep whatever fields were already extracted.
        print("页面抓取失败:{0} ({1!r})".format(purl, err))
    datalist.append([purl, name, director, date, score, votes, img, desp])
def saveToFile(param):
    """Dump every row of ``datalist`` into an .xls workbook saved at *param*.

    Any file already present at *param* is removed first.  Row 0 of the
    sheet holds the column titles; each following row is one ``datalist``
    entry, written cell by cell in column order.

    Args:
        param: Destination path of the .xls file.
    """
    if os.path.exists(param):
        os.remove(param)
    print(len(datalist))

    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('豆瓣最新30部电影推荐', cell_overwrite_ok=True)
    headers = ('链接', '影片名', '导演', '上市日期', '评分', '评分人数', '缩略图链接', '简述')

    # Header row.
    for col_idx, title in enumerate(headers):
        worksheet.write(0, col_idx, title)

    # Data rows start at sheet row 1, directly below the header.
    for row_idx, record in enumerate(datalist, start=1):
        for col_idx, _ in enumerate(headers):
            worksheet.write(row_idx, col_idx, record[col_idx])

    workbook.save(param)
    print('xls文件已保存')
def saveImg(param):
    """Download the poster of every row in ``datalist`` into directory *param*.

    The directory is recreated from scratch on every run.  Each image is
    stored as ``<position>.<title>.<extension>``, where the extension is the
    text after the last ``.`` of the image URL.  Downloads are best-effort:
    a failure is reported and the loop moves on to the next row.

    Args:
        param: Target directory; a trailing path separator is optional.
    """
    print("开始获取图片")
    if os.path.exists(param):
        shutil.rmtree(param)
    # makedirs also creates missing parent directories.
    os.makedirs(param)

    # Movie titles may contain characters that are illegal in file names
    # (notably on Windows); map them to '_' instead of losing the image.
    forbidden = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for position, row in enumerate(datalist, start=1):
        title, img_url = row[1], row[6]
        print(img_url)
        try:
            request = Request(img_url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
            # Fetch the whole image before touching the disk so that a failed
            # download does not leave an empty file behind.
            with urlopen(request) as resp:
                payload = resp.read()
            filename = "{0}.{1}.{2}".format(
                position, str(title).translate(forbidden), img_url.split('.')[-1])
            # os.path.join keeps the file inside *param* whether or not the
            # caller supplied a trailing separator.
            with open(os.path.join(param, filename), 'wb') as f:
                f.write(payload)
        except Exception as err:
            # Unlike a bare ``except``, this lets Ctrl-C through and makes
            # the skipped image visible.
            print("图片下载失败:{0} ({1!r})".format(img_url, err))
            continue
    print("图片爬取完成")
if __name__ == '__main__':
    # Script entry point: fetch the recommendation listing, scrape every
    # listed movie page, then export the spreadsheet and the poster images.
    values = {
        'type': 'movie',
        'tag': '热门',
        'sort': 'recommend',
        'page_limit': '30',  # adjustable: number of records to crawl
        'page_start': '0'  # offset of the first record to crawl
    }
    # NOTE: supplying ``data=`` makes urllib send the parameters as a POST
    # body; that behaviour of the original script is kept as-is.
    listing_request = Request('https://movie.douban.com/j/search_subjects',
                              data=urlencode(values).encode('utf-8'))
    # Close the listing response as soon as the JSON payload has been read.
    with urlopen(listing_request) as uop:
        subjects = json.loads(uop.read())['subjects']
    for item in subjects:
        getDetail(item['url'])
    saveToFile('d:/Zzdouban.xls')
    saveImg("d:/Zzdouban/")
结果