# 大数据分析与可视化 之 猫眼电影爬虫 (Big-data analysis & visualization: Maoyan movie spider)
import random
import time
import re
import requests
import csv
class MaoyanSpider(object):
    """Scraper for the Maoyan Top-100 movie board.

    Downloads each board page, extracts (title, stars, release date) with a
    regex, appends the rows to ``maoyan.csv`` and echoes them to stdout.
    """

    def __init__(self):
        # Board URL template; ``offset`` selects the page (10 movies per page).
        self.url = 'https://www.maoyan.com/board/4?offset={}'

    def get_html(self, url):
        """Fetch one board page and hand the HTML to the parser.

        :param url: fully formatted board-page URL.
        """
        # FIX: several header keys/values originally contained stray spaces
        # ('Accept - Encoding', 'max - age = 0', 'zh - CN, ...'), so servers
        # ignored them.  'br' is intentionally not advertised because
        # requests cannot decode brotli without the optional brotli package.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': '__mta=142521997.1695026370028.1695105604302.1695106030738.15; uuid_n_v=v1; uuid=E13BF08055FE11EEBD786D8B351D1BEB69DF40D3DDA545AF98A7B7303437B12C; _lxsdk_cuid=18aa771d411c8-0d1e3cae321997-78505774-16e360-18aa771d411c8; _lxsdk=E13BF08055FE11EEBD786D8B351D1BEB69DF40D3DDA545AF98A7B7303437B12C; _csrf=db721765308a1674e7cba19e5867dbf60fb59d96717ba3913e47e499ab4726c0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1695026370,1695086835; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; __mta=142521997.1695026370028.1695087067256.1695090012297.10; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1695106030; _lxsdk_s=18aac26180a-d75-779-1ac%7C%7C9',
            'Host': 'www.maoyan.com',
            'Referer': 'https://www.maoyan.com/board?requestCode=1d65f67ff588bed531de86b8d57274ac6pskg',
            'Sec-Ch-Ua': '"Microsoft Edge"; v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
        }
        res = requests.get(url=url, headers=headers)
        # Hand off directly to the parser.
        self.parse_html(res.text)

    def parse_html(self, html):
        """Extract (title, stars, release-time) tuples and save them.

        :param html: raw HTML of one board page.
        """
        # re.S lets '.' span newlines because each movie entry is multi-line.
        re_bds = (r'<div class="movie-item-info">.*?title="(.*?)"'
                  r'.*?<p class="star">(.*?)</p>'
                  r'.*?class="releasetime">(.*?)</p>')
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        self.save_html(r_list)

    def save_html(self, r_list):
        """Append parsed rows to ``maoyan.csv`` (UTF-8) and print them.

        :param r_list: list of (title, star-line, release-line) tuples.
        """
        with open('maoyan.csv', 'a', newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            for r in r_list:
                name = r[0].strip()
                # Drop the 3-character '主演:' prefix from the star field.
                star = r[1].strip()[3:]
                # Slice the date out of e.g. '上映时间:2018-07-05' — the
                # 5-character prefix, then 10 date characters.  Renamed from
                # ``time`` to stop shadowing the time module.
                release_date = r[2].strip()[5:15]
                writer.writerow([name, star, release_date])
                print(name, release_date, star)

    def run(self):
        """Crawl the first 7 board pages (offsets 0, 10, ..., 60)."""
        for offset in range(0, 61, 10):
            url = self.url.format(offset)
            self.get_html(url)
            # Random 1-2 s pause between requests to avoid hammering the site.
            time.sleep(random.uniform(1, 2))
# Script entry point.
if __name__ == '__main__':
    # Top-level boundary: report any crawl failure instead of a raw traceback.
    try:
        MaoyanSpider().run()
    except Exception as e:
        print("错误:", e)
# 标签 (tags): __, url, self, list, 爬虫, html, 可视化, Sec, 猫眼
# From: https://www.cnblogs.com/IvanKK/p/17936782