import requests
from lxml import etree
import pandas as pd
# Walk the Douban Top 250 listing page by page and accumulate, for every
# entry found, its title, rating text, credits line and type line in the
# four module-level ``all_*`` lists.

PAGE_SIZE = 25        # the "start" offset grows by this much per request
TOTAL_ITEMS = 250     # paging stops once the offset reaches this value
REQUEST_TIMEOUT = 10  # seconds; without it requests.get can block forever


def _page_url(start):
    """Return the listing URL for the page beginning at offset *start*."""
    return "https://movie.douban.com/top250?start=" + str(start) + "&filter="


def _clean(text):
    """Return *text* without non-breaking spaces or surrounding whitespace."""
    return text.replace("\xa0", "").strip()


num = 0
url = _page_url(num)
# Browser-like User-Agent sent with every request.
# NOTE(review): presumably the site rejects the default client UA - confirm.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/"
           "537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
all_name = []
all_star = []
all_actor = []
all_types = []

while num < TOTAL_ITEMS:
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error status rather than parsing the error
    # page and silently collecting nothing for this offset.
    resp.raise_for_status()
    e = etree.HTML(resp.text)

    # First <span> inside the link of every div.hd.
    name = e.xpath('//div[@class="hd"]/a/span[1]/text()')
    # Second <span> of every div.star.
    star = e.xpath('//div[@class="star"]/span[2]/text()')
    # Text nodes of the first <p> in every div.bd. The slicing below treats
    # them as strictly alternating: even positions feed ``actor``, odd
    # positions feed ``types``.
    # NOTE(review): assumes each entry yields exactly two text nodes - a
    # different count would shift every later entry; confirm against the page.
    actor_type = e.xpath('//div[@class="bd"]/p[1]/text()')
    actor = [_clean(each) for each in actor_type[::2]]
    types = [_clean(each) for each in actor_type[1::2]]

    all_name.extend(name)
    all_star.extend(star)
    all_actor.extend(actor)
    all_types.extend(types)

    num += PAGE_SIZE
    url = _page_url(num)
# Tags: num, name, actor, scraping, Douban, each, star, top250, types
# From: https://www.cnblogs.com/jzm123/p/17292498.html