写法一:
编写两个爬虫程序文件:爬虫1将豆瓣一周口碑榜的电影url添加到redis中名为movie_url的列表中(注意避免多次运行导致重复的问题);
爬虫2从movie_url中读出网址,爬取每一部电影的导演、主演、类型、制片国家/地区、语言、上映日期、片长,
并将它们保存到redis的hash表(自行命名)中。
def get_sub_page(href):
    """Fetch one Douban movie detail page and extract its metadata.

    Parameters
    ----------
    href : str
        URL of the movie's detail page.

    Returns
    -------
    tuple of str
        (score, director, writer, stars, genres, region, language,
         release_dates, runtime, other_name, imdb_id).

    Raises
    ------
    AttributeError
        If a mandatory field is missing from the page markup.
    """
    source = request_url(href)
    # Average rating, e.g. "8.7".
    score = re.search(
        '<strong class="ll rating_num" property="v:average">(.+)</strong>',
        source).group(1)
    con_list = re.findall('<div id="info">(.+)<br>', source, re.S)
    for item in con_list:
        # Strip celebrity anchors and closing spans so the field
        # regexes below see mostly plain text.
        item = re.sub(r'<a href="/celebrity/[\d]+/"', '', item)
        item = re.sub('</span>', '', item)
        # Director
        actor = re.search('rel="v:directedBy">(.+)</a>', item).group(1)
        # Screenwriter(s): collapse leftover anchor markup into commas.
        writer = re.search("编剧: <span class='attrs'>(.+)</a><br/>", item).group(1)
        writer = writer.replace("</a> / >", ",")
        writer = (re.sub('<a href="/(.*?)">', ',', writer)
                  .replace('</a> / ', "").replace(">", ""))
        # Leading actors
        star_list = re.search('rel="v:starring">(.+)</a><br/>', item).group(1)
        star_list = re.sub('</a> / rel="v:starring">', ",", star_list)
        if "href" in star_list:
            star_list = re.sub(
                '</a> / <a href="/(.*?)" rel="v:starring">', ',', star_list)
        # Genres: keep only the Chinese words, comma-joined.
        # (Renamed from `type`, which shadowed the builtin.)
        genre = re.search('<span property="v:genre">(.+)<br/>', item).group(1)
        genre = ','.join(re.findall(r'[\u4e00-\u9fa5]+', genre))
        # Country/region of production
        region = re.search("制片国家/地区: (.+)<br/>", item).group(1)
        # Language
        language = re.search("语言: (.+)<br/>", item).group(1)
        # Release date(s), comma-joined.
        date = ','.join(
            re.findall('"v:initialReleaseDate" content="(.*?)">', item))
        # Runtime, e.g. "142分钟".
        runtime = re.search("片长: (.+)<br/>", item).group(1)
        runtime = ''.join(re.findall(r'[\d]+[\u4e00-\u9fa5]+', runtime))
        # Alternative titles are optional: a failed match yields ''.
        # (Was a bare `except:` around .group(1), which also hid real bugs.)
        other_match = re.search("又名: (.+)<br/>", item)
        other_name = other_match.group(1) if other_match else ""
        # IMDb identifier
        IMDb = re.search("IMDb: (.+)", item).group(1)
    return (score, actor, writer, star_list, genre, region, language,
            date, runtime, other_name, IMDb)
def request_url(url):
    """Download *url* with the shared request headers and return the
    response body decoded as UTF-8 text."""
    response = requests.get(url, headers=header)
    return response.content.decode('utf-8')
import redis
from ch03_douban_movie.one import *

base_url = "https://movie.douban.com/"
# Connect to the local Redis server; decode_responses=True makes reads
# come back as str instead of bytes.
client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
# Fetch the Douban movie home page.
source = request_url(base_url)
# Narrow the markup down to the weekly billboard section.
source = re.findall(
    '<div id="billboard" class="s" data-dstat-areaid="75" data-dstat-mode="click,expose">(.+)</div>', source, re.S)
# Re-running the script must not append duplicate URLs, so rebuild the
# movie_url list from scratch every run.
client.delete('movie_url')
for item in source:
    item = re.findall("<table>(.+)</table>", item, re.S)
    for block in item:
        # Detail-page links for the weekly top-10 movies.
        href_list = re.findall('href="(.*?)">', block, re.S)
        print(href_list)
        # Push each URL onto the Redis list movie_url.
        for href in href_list:
            print(href)
            client.rpush('movie_url', href)
# redis-py exposes close(), not Close() (which raised AttributeError).
client.close()

# decode_responses=True is required here too, otherwise lrange() returns
# bytes and the downstream requests/regex calls fail.
client = redis.StrictRedis(decode_responses=True)
# Read the links back from movie_url and build one hash per movie.
for rank, href in enumerate(client.lrange('movie_url', 0, -1), start=1):
    info = list(get_sub_page(href))
    movie_name = re.search('<span property="v:itemreviewed">(.+)</span>',
                           request_url(href)).group(1)
    info.insert(0, movie_name)
    # Write the record into a Redis hash, one per ranking position.
    keys = ["电影名", "评分", "导演", "编剧", "主演", "类型", "制片国家/地区", "语言",
            "上映时间", "时长", "又名", "IMDb"]
    info = dict(zip(keys, info))
    print(info)
    # hmset() is deprecated in redis-py 3.x; hset(mapping=...) replaces it.
    client.hset('WeekMovieTop%d' % rank, mapping=info)
client.close()
写法二:
import requests
import cchardet
import re
import redis
import json
# Shared request headers for every Douban request.
# NOTE(review): the Cookie below is a hard-coded personal session token --
# it will expire, ties the scraper to one account, and should not be
# committed to source control. Consider loading it from the environment.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
'Cookie': 'bid=Wt9rGb6VTcE; douban-fav-remind=1; __gads=ID=b0b9fc62ad8fd36e-2277b1a4d0ca0007:T=1629037033:RT=1629037033:S=ALNI_MZcQI-zVIz4SDF1JEWl3bohLM8JKA; viewed="35571443"; gr_user_id=b4003e18-ed65-42a8-b2aa-c2eee8128f95; ll="118200"; _vwo_uuid_v2=DAAC4D9D6B82F69AC1F055078D065C751|92efe72a313f1fd9c1647ee1c083fa7d; push_doumail_num=0; push_noty_num=0; __utmv=30149280.14646; dbcl2="146463518:/DZkK9xj7FI"; ck=1dY8; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637220583%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1433569655.1629037036.1635603856.1637220583.19; __utmb=30149280.0.10.1637220583; __utmc=30149280; __utmz=30149280.1637220583.19.8.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1215803576.1633773615.1634231817.1637220583.13; __utmb=223695111.0.10.1637220583; __utmc=223695111; __utmz=223695111.1637220583.13.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.4cf6=f75b65b3de20f07e.1633773614.15.1637220605.1636869814.; ap_v=0,6.0'
}
def get_movie_list():
    """Scrape the weekly billboard from the Douban movie home page.

    Returns
    -------
    list of tuple
        (detail_url, title) pairs for the billboard movies.

    Raises
    ------
    AttributeError
        If the billboard section is not found in the page markup.
    """
    resp = requests.get('https://movie.douban.com', headers=header)
    # Trust byte-level charset detection over the server-declared
    # encoding, which Douban sometimes gets wrong.
    resp.encoding = cchardet.detect(resp.content)['encoding']
    # (Removed leftover debug prints of resp.encoding/apparent_encoding.)
    movie_list_section = re.search(
        r'<div class="billboard-bd">(.*?)<div id="dale_movie_home_bottom_right"',
        resp.text, re.S).group(1)
    return re.findall(r'<tr>.*?href="(.*?)">(.*?)</a>',
                      movie_list_section, re.S)
def get_movie_info(movie_url_name):
    """Fetch a movie detail page and extract its core metadata.

    Parameters
    ----------
    movie_url_name : tuple
        (detail_url, title) as produced by get_movie_list().

    Returns
    -------
    dict
        Chinese-keyed metadata: title, director, screenwriter, actors,
        genre, region, language, release dates and runtime. Fields
        missing from the page come back as ''.
    """
    def _group1(pattern, text):
        # Some fields (e.g. 编剧 on documentaries) are absent on certain
        # pages; return '' instead of crashing with AttributeError on
        # None.group(1).
        m = re.search(pattern, text, re.S)
        return m.group(1) if m else ''

    resp = requests.get(movie_url_name[0], headers=header)
    # Byte-level charset detection is more reliable than the declared one.
    resp.encoding = cchardet.detect(resp.content)['encoding']
    movie_info_section = _group1(r'<div id="info">(.*?)</div>', resp.text)
    director = '/'.join(re.findall(r'href=.*?v:directedBy">(.*?)</a>',
                                   movie_info_section, re.S))
    screenwriter_section = _group1(r"编剧.*?'attrs'>(.*?)</span>",
                                   movie_info_section)
    screenwriter = '/'.join(re.findall(r'href=.*?>(.*?)</a>',
                                       screenwriter_section, re.S))
    actor = '/'.join(re.findall(r'href=.*?v:starring">(.*?)</a>',
                                movie_info_section, re.S))
    movie_type = _group1(r'property="v:genre">(.*?)</span>',
                         movie_info_section)
    district = _group1(r'制片国家/地区:</span>(.*?)<br/>', movie_info_section)
    language = _group1(r'语言:</span>(.*?)<br/>', movie_info_section)
    initial_release_date = '/'.join(
        re.findall(r'v:initialReleaseDate.*?>(.*?)</span>',
                   movie_info_section, re.S))
    runtime = _group1(r'v:runtime.*?>(.*?)</span>', movie_info_section)
    return {'片名': movie_url_name[1], '导演': director, '编剧': screenwriter,
            '演员': actor, '类型': movie_type, '制片国家/地区': district,
            '语言': language, '上映日期': initial_release_date, '片长': runtime}
def save_to_redis_list(movie_info_list, redis_list):
    """Append each movie dict to a Redis list as a JSON string.

    Parameters
    ----------
    movie_info_list : list of dict
        Movie metadata dicts from get_movie_info().
    redis_list : str
        Name of the Redis list to push onto.
    """
    # decode_responses must be configured on the ConnectionPool: when a
    # pool is supplied, StrictRedis silently ignores the kwarg passed to
    # itself (the original passed it in the wrong place).
    cp = redis.ConnectionPool(host='localhost', port=6379, db=0,
                              decode_responses=True)
    client = redis.StrictRedis(connection_pool=cp)
    for mv in movie_info_list:
        # ensure_ascii=False keeps the Chinese text readable in Redis.
        client.rpush(redis_list, json.dumps(mv, ensure_ascii=False))
def save_to_redis_hash(movie_info_list):
    """Store each movie dict as its own Redis hash.

    Keys are named douban_movie:1, douban_movie:2, ... in list order.

    Parameters
    ----------
    movie_info_list : list of dict
        Movie metadata dicts from get_movie_info().
    """
    client = redis.StrictRedis(decode_responses=True)
    for i, mv in enumerate(movie_info_list, start=1):
        # hmset() is deprecated since redis-py 3.x; hset(name, mapping=...)
        # is the supported way to write a whole hash at once.
        client.hset(f'douban_movie:{i}', mapping=mv)
if __name__ == '__main__':
    # Collect detail info for every billboard entry, then persist it.
    movie_detail_list = [get_movie_info(movie) for movie in get_movie_list()]
    # save_to_redis_list(movie_detail_list, 'douban_queue')
    save_to_redis_hash(movie_detail_list)
标签:info,电影周,item,url,Redis,Top10,list,re,movie
From: https://www.cnblogs.com/Gimm/p/18116354