# -*- coding: utf-8 -*-
"""
Scrape movie details from Douban with XPath.

Created on Sat Oct 8 13:09:04 2022

@author: 小徐同学
"""
import requests
from lxml.html import fromstring

base_url = "https://movie.douban.com/"
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.34"
    )
}

# Seconds to wait for the server before giving up; without a timeout,
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download *url* with the shared headers and return its body as UTF-8 text."""
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def get_url(html_text):
    """Return the href of every ``table/tr/td/a`` link found in *html_text*.

    Args:
        html_text: HTML source of the listing page.

    Returns:
        A list of URL strings; empty when the page contains no matching links.
    """
    doc = fromstring(html_text)
    # Selecting @href directly skips anchors that have no href attribute and
    # always yields a list, so callers can iterate without a None check.
    return doc.xpath("//table/tr/td/a/@href")


def get_every_text(movie_title_url):
    """Return the HTML source of a single movie page.

    Args:
        movie_title_url: URL of the movie detail page.

    Returns:
        The page body decoded as UTF-8 text.
    """
    return _fetch_html(movie_title_url)


def get_every_detail_content(every_detail):
    """Extract title, director, cast, release date, runtime and score.

    The result is printed and also returned.

    Args:
        every_detail: HTML source of a movie detail page.

    Returns:
        A one-element list wrapping ``[title, director, actor_and_actress,
        date, time_long, score]``. Each field is the list of text nodes matched
        by its XPath, and is empty when the page has no such element.
    """
    doc = fromstring(every_detail)

    title = doc.xpath("//*[@id='content']/h1/span[1]/text()")
    director = doc.xpath(
        ".//*[@id='info']/span/span/a[@rel='v:directedBy']/text()"
    )
    actor_and_actress = doc.xpath(
        ".//div[@id='info']/span[@class='actor']/span//a/text()"
    )
    date = doc.xpath(
        ".//div[@id='info']/span[@property='v:initialReleaseDate']/text()"
    )
    # The runtime previously reused the release-date expression.
    # NOTE(review): assumes Douban marks the runtime span with
    # property='v:runtime' - confirm against a live page.
    time_long = doc.xpath(
        ".//div[@id='info']/span[@property='v:runtime']/text()"
    )
    # contains() needs the attribute and a quoted string; the old predicate
    # compared @class with a boolean and matched any div that had a class.
    score = doc.xpath(
        ".//div[@id='interest_sectl']/div"
        "/div[contains(@class, 'rating_self')]/strong/text()"
    )

    record = [title, director, actor_and_actress, date, time_long, score]
    # Kept as a nested list so the printed output keeps its original shape.
    list1 = [record]
    print(list1)
    return list1


def main():
    """Fetch the Douban movie home page and print details for each linked movie."""
    home_html = _fetch_html(base_url)
    for url in get_url(home_html):
        every_detail = get_every_text(url)
        get_every_detail_content(every_detail)


if __name__ == "__main__":
    main()
# Tags: xpath, title, url, text, scraping, Douban, every, doc. Source: https://www.cnblogs.com/xjhblogs/p/16774168.html