需求:爬取电影名称、评分、引言、详情页的 url,翻页爬取 1-10 页。
代码:
import requests
from lxml import etree
'''
url分析:
第一页:https://movie.douban.com/top250
第二页:https://movie.douban.com/top250?start=25&filter=
第三页:https://movie.douban.com/top250?start=50&filter=
第四页:https://movie.douban.com/top250?start=75&filter=
url 模型:https://movie.douban.com/top250?start={25 * (页数 - 1)}&filter=
'''
def getsource(pagelink, timeout=10):
    """Download a page and return its HTML source as text.

    Args:
        pagelink: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Note:
        Reads the module-level ``headers`` dict, which the script entry
        point defines before calling this function.
    """
    response = requests.get(url=pagelink, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on requests' charset guess.
    response.encoding = 'utf-8'
    return response.text
def getitem(html):
    """Parse one listing page into a list of movie records.

    Args:
        html: HTML source of a listing page.

    Returns:
        A list of dicts, one per ``div.info`` node found under an ``li``
        element, with the keys ``title``, ``grade``, ``link`` and
        ``quote``. ``quote`` is ``None`` when the entry has no quote text.

    Raises:
        IndexError: If an entry has no rating text or no detail link.
    """

    def clean_text(node, path):
        # Join every text node matched by *path* and drop non-breaking spaces.
        return ''.join(node.xpath(path)).replace('\xa0', '')

    page = etree.HTML(html)
    records = []
    for info in page.xpath('//li//div[@class="info"]'):
        # Name: the main title spans followed by the alternative titles.
        main_title = clean_text(
            info, './div[@class="hd"]/a/span[@class="title"]/text()')
        alt_title = clean_text(
            info, './div[@class="hd"]/a/span[@class="other"]/text()')
        # Rating.
        score = info.xpath(
            './div[@class="bd"]/div[@class="star"]'
            '/span[@class="rating_num"]/text()')[0]
        # URL of the detail page.
        href = info.xpath('div[@class="hd"]/a/@href')[0]
        # Quote; some entries have none.
        quotes = info.xpath('./div[@class="bd"]/p[@class="quote"]/span/text()')
        records.append({
            'title': main_title + alt_title,
            'grade': score,
            'link': href,
            'quote': quotes[0] if quotes else None,
        })
    return records
def writedata(li):
    """Placeholder for saving the scraped records; not implemented yet.

    Args:
        li: Records to save. Currently ignored.

    Returns:
        None.
    """
    return None
if __name__ == '__main__':
    # Script entry point: fetch the 10 listing pages and collect all records.
    # Named ``all_movies`` rather than ``all`` so the builtin all() is not
    # shadowed.
    all_movies = []
    # Must stay a module-level name: getsource() reads ``headers`` as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    }
    for i in range(10):
        # Each page lists 25 entries, so page i (0-based) starts at 25 * i.
        url = f'https://movie.douban.com/top250?start={25 * i}&filter='
        html = getsource(url)
        all_movies.extend(getitem(html))
    print(len(all_movies), all_movies)
标签:xpath,title,quote,li,豆瓣,top250,class
From: https://www.cnblogs.com/tudigong/p/18466787