# 目录
# 第一部分：使用re爬取 + 伪请求头，保存为csv
import requests
import re
import csv
from fake_useragent import UserAgent
#re文档:
#https://docs.python.org/zh-cn/3.8/library/re.html#re.S
# Shared HTTP headers for all Douban requests: a randomized IE user-agent
# (via fake_useragent) plus a logged-in session cookie — Douban blocks
# anonymous scripted requests on some pages.
# NOTE(review): the hard-coded cookie will expire; refresh it from a browser
# session when requests start failing.
header = {
'user-agent': UserAgent().ie,
'cookie':'bid=gZhOMjq7Ag0; ll="118200"; __gads=ID=ee81490f4e78ee41-226c825738cf0077:T=1637495539:RT=1637495539:S=ALNI_MYAsbTf9f4zarcndONOU8V3iX3aKg; _vwo_uuid_v2=D5CD017E3233C8F72BD20AB7E8A3DE8C6|e0376aed09832ec0574b534bffe098fc; dbcl2="144119796:t9KAADz+2i4"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.14411; ck=oiAV; _pk_ref.100001.4cf6=["","",1637997339,"https://www.baidu.com/link?url=-BeYMom6zanu8afK9L3VZBlLbFUbdO_SynvSZ9V8_KxMbRniAGx-WAUEh-IFvJ4g&wd=&eqid=e083a3d3000506490000000661a1db18"]; _pk_id.100001.4cf6=98c1f43971dcb9d9.1637495527.4.1637997339.1637511633.; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.2069033705.1637495528.1637679354.1637997340.5; __utmb=30149280.0.10.1637997340; __utmc=30149280; __utmz=30149280.1637997340.5.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1576995638.1637495528.1637510908.1637997340.4; __utmb=223695111.0.10.1637997340; __utmc=223695111; __utmz=223695111.1637997340.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}
def main(base_url):
    """Entry point of part 1.

    Fetches *base_url* (the Douban movie homepage), cuts out the weekly
    billboard <div>, and hands the matched section(s) to the parser.
    """
    html = request_url(base_url)
    # re.S lets '.' span newlines so the whole billboard block is captured.
    billboard = re.findall(
        '<div id="billboard" class="s" data-dstat-areaid="75" data-dstat-mode="click,expose">(.+)</div>',
        html,
        re.S,
    )
    all_conditions(billboard)
#请求得到源码
def request_url(url, timeout=15):
    """GET *url* with the shared crawler headers and return the body as text.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float, optional
        Socket timeout in seconds (new keyword, defaults keep existing
        callers unchanged). Without it a stalled connection would hang
        the whole crawl indefinitely.

    Returns
    -------
    str
        Response body decoded as UTF-8 (Douban serves UTF-8; decoding
        explicitly avoids trusting the charset guess of `requests`).
    """
    req = requests.get(url, headers=header, timeout=timeout)
    return req.content.decode('utf-8')
#主页
def all_conditions(source):
    """Parse the billboard section(s) into per-movie rows and save them.

    *source* is the list of billboard HTML fragments produced by main().
    For each movie row it extracts rank, title and detail-page URL, fetches
    the detail page via get_sub_page(), and appends everything as one row.
    Finally writes all rows to CSV via save().
    """
    datalist = []
    for fragment in source:
        # Each billboard fragment holds one <table> with the top-10 rows.
        tables = re.findall("<table>(.+)</table>", fragment, re.S)
        for table in tables:
            # Rank, detail-page link and (CJK) title for every row.
            rank_list = re.findall('<td class="order">(.+)</td>', table)
            href_list = re.findall('href="(.*?)">', table, re.S)
            name_list = re.findall('[\u4e00-\u9fa5]+', table, re.S)
            # Fix: the original reused the loop variable `i` at three
            # nesting levels; distinct names make the flow readable.
            for href, name, rank in zip(href_list, name_list, rank_list):
                data = [rank, name, href]
                # Append every detail field scraped from the sub page.
                data.extend(get_sub_page(href))
                datalist.append(data)
                print(data)
    # Save once after all rows are collected ("w+" truncates the file, so
    # saving inside the loop would only rewrite the same growing list).
    save(datalist)
#获取子页下所有信息
def get_sub_page(href):
    """Fetch one movie's detail page and scrape its info box.

    Parameters
    ----------
    href : str
        Absolute URL of the movie's Douban detail page.

    Returns
    -------
    tuple
        (score, director, writers, stars, genres, region, language,
        release_dates, runtime, other_names, imdb_id) — all strings.

    Raises
    ------
    AttributeError
        If a mandatory field is missing from the page (re.search -> None).
    """
    source = request_url(href)
    score = re.search('<strong class="ll rating_num" property="v:average">(.+)</strong>', source).group(1)
    con_list = re.findall('<div id="info">(.+)<br>', source, re.S)
    for item in con_list:
        # Strip celebrity links and closing spans so the field regexes
        # below match plain text between the HTML labels.
        item = re.sub('<a href="/celebrity/[\d]+/"', '', item)
        item = re.sub('</span>', '', item)
        # Director.
        actor = re.search('rel="v:directedBy">(.+)</a>', item).group(1)
        # Writers: collapse the separating anchor markup into commas.
        writer = re.search("编剧: <span class='attrs'>(.+)</a><br/>", item).group(1)
        writer = writer.replace("</a> / >", ",")
        writer = re.sub('<a href="/(.*?)">', ',', writer).replace('</a> / ', "").replace(">", "")
        # Main cast: same comma-joining treatment.
        star_list = re.search('rel="v:starring">(.+)</a><br/>', item).group(1)
        star_list = re.sub('</a> / rel="v:starring">', ",", star_list)
        if "href" in star_list:
            star_list = re.sub('</a> / <a href="/(.*?)" rel="v:starring">', ',', star_list)
        # Genres — keep only the CJK words, comma-joined.
        # (Renamed from `type`, which shadowed the builtin.)
        genre = re.search('<span property="v:genre">(.+)<br/>', item).group(1)
        genre = ','.join(re.findall('[\u4e00-\u9fa5]+', genre))
        # Production country/region.
        region = re.search("制片国家/地区: (.+)<br/>", item).group(1)
        # Language.
        language = re.search("语言: (.+)<br/>", item).group(1)
        # Release dates (a movie may have several), comma-joined.
        date = ','.join(re.findall('"v:initialReleaseDate" content="(.*?)">', item))
        # Runtime: keep digits + CJK unit, e.g. "142分钟".
        runtime = re.search("片长: (.+)<br/>", item).group(1)
        runtime = ''.join(re.findall('[\d]+[\u4e00-\u9fa5]+', runtime))
        # Alternative titles — optional field; empty string when absent.
        # Fix: catch only the AttributeError raised when re.search finds
        # nothing, instead of a bare `except:` that hid every error.
        try:
            other_name = re.search("又名: (.+)<br/>", item).group(1)
        except AttributeError:
            other_name = ""
        # IMDb identifier.
        IMDb = re.search("IMDb: (.+)", item).group(1)
    return score, actor, writer, star_list, genre, region, language, date, runtime, other_name, IMDb
#保存为csv文件
def save(data):
    """Write the scraped movie rows to DoubanMovieWeekTop10.csv.

    Parameters
    ----------
    data : list[list]
        One row per movie: [rank, title, url, score, director, writers,
        stars, genres, region, language, release dates, runtime,
        other names, IMDb id].

    The utf-8-sig encoding writes a BOM so Excel opens the Chinese text
    correctly; newline="" is required by the csv module on Windows.
    """
    with open("DoubanMovieWeekTop10.csv", "w+", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        # Bug fix: column 12 was labelled "片场" (film set); the value
        # stored there is the runtime, i.e. "片长".
        writer.writerow(["排名", "电影名", "详情链接", "评分", "导演", "编剧", "主演",
                         "类型", "制片国家/地区", "语言", "上映时间", "片长", "又名", "IMDb"])
        writer.writerows(data)
if __name__ == '__main__':
    # Part 1 entry point: crawl the weekly billboard from the homepage.
    start_url = "https://movie.douban.com/"
    main(start_url)
# 第二部分：使用re爬取2 + 不保存
import requests
import cchardet
import re
# Part-2 request headers: a fixed Chrome user-agent plus a logged-in Douban
# session cookie. Redefines the part-1 `header` when both scripts live in
# one file — the second definition wins for the code below.
# NOTE(review): the hard-coded cookie will expire; refresh it when requests fail.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
'Cookie': 'bid=Wt9rGb6VTcE; douban-fav-remind=1; __gads=ID=b0b9fc62ad8fd36e-2277b1a4d0ca0007:T=1629037033:RT=1629037033:S=ALNI_MZcQI-zVIz4SDF1JEWl3bohLM8JKA; viewed="35571443"; gr_user_id=b4003e18-ed65-42a8-b2aa-c2eee8128f95; ll="118200"; __utmz=30149280.1633773615.6.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1633773615.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=DAAC4D9D6B82F69AC1F055078D065C751|92efe72a313f1fd9c1647ee1c083fa7d; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; __utma=30149280.1433569655.1629037036.1634220097.1634222012.15; __utmb=30149280.0.10.1634222012; __utma=223695111.1215803576.1633773615.1634220097.1634222012.10; __utmb=223695111.0.10.1634222012; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1634222012%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dwyyw1hAEPCDkeCOiS0lMWDx6tRJnw2gELr3aZop7fzDRrduYHXftRKiI4PbeclDL%26wd%3D%26eqid%3Db60e5a81000182b7000000066161682b%22%5D; _pk_id.100001.4cf6=f75b65b3de20f07e.1633773614.10.1634222012.1634220097.; _pk_ses.100001.4cf6=*; dbcl2="146463518:ozVFabF9880"'
}
def get_movie_list():
    """Fetch the Douban movie homepage and return the weekly billboard.

    Returns
    -------
    list[tuple[str, str]]
        (detail_url, title) pairs, one per billboard row.
    """
    # Fix: add a timeout so a stalled connection cannot hang forever.
    resp = requests.get('https://movie.douban.com', headers=header, timeout=15)
    # Detect the real charset from the response bytes rather than trusting
    # the Content-Type header.
    resp.encoding = cchardet.detect(resp.content)['encoding']
    movie_list_section = re.search(
        r'<div class="billboard-bd">(.*?)<div id="dale_movie_home_bottom_right"',
        resp.text, re.S).group(1)
    movie_list = re.findall(r'<tr>.*?href="(.*?)">(.*?)</a>', movie_list_section, re.S)
    return movie_list
def get_movie_info(movie_url_name):
    """Scrape one movie's detail page into a dict of labelled fields.

    Parameters
    ----------
    movie_url_name : tuple[str, str]
        (detail_url, title) pair as produced by get_movie_list().

    Returns
    -------
    dict[str, str]
        Chinese field labels mapped to the scraped values.

    Raises
    ------
    AttributeError
        If a mandatory field is missing from the page (re.search -> None).
    """
    # Fix: add a timeout so a stalled connection cannot hang the loop.
    resp = requests.get(movie_url_name[0], headers=header, timeout=15)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    movie_info_section = re.search(r'<div id="info">(.*?)</div>', resp.text, re.S).group(1)
    director = '/'.join(re.findall(r'href=.*?v:directedBy">(.*?)</a>', movie_info_section, re.S))
    screenwriter_section = re.search(r"编剧.*?'attrs'>(.*?)</span>", movie_info_section, re.S).group(1)
    screenwriter = '/'.join(re.findall(r'href=.*?>(.*?)</a>', screenwriter_section, re.S))
    actor = '/'.join(re.findall(r'href=.*?v:starring">(.*?)</a>', movie_info_section, re.S))
    movie_type = re.search(r'property="v:genre">(.*?)</span>', movie_info_section, re.S).group(1)
    district = re.search(r'制片国家/地区:</span>(.*?)<br/>', movie_info_section, re.S).group(1)
    language = re.search(r'语言:</span>(.*?)<br/>', movie_info_section, re.S).group(1)
    initial_release_date = '/'.join(re.findall(r'v:initialReleaseDate.*?>(.*?)</span>', movie_info_section, re.S))
    runtime = re.search(r'v:runtime.*?>(.*?)</span>', movie_info_section, re.S).group(1)
    movie_detail = {'片名': movie_url_name[1], '导演': director, '编剧': screenwriter, '演员': actor,
                    '类型': movie_type, '制片国家/地区': district,
                    '语言': language, '上映日期': initial_release_date, '片长': runtime}
    return movie_detail
if __name__ == '__main__':
    # Part 2 entry point: fetch every billboard movie's details, then
    # print them field by field with a blank line between movies.
    movie_detail_list = [get_movie_info(entry) for entry in get_movie_list()]
    for detail in movie_detail_list:
        for field, text in detail.items():
            print(f'{field}:{text}')
        print()