import requests
import openpyxl
import re
import time
import pymysql
class DoubanSpider:
    """Scrape the Douban Top 250 movie list and export it to Excel or MySQL.

    Scraped results accumulate in ``self.movie_data`` as a list with one
    entry per fetched page; each entry is a list of dicts keyed by
    '电影名称', '导演与主演', '电影评分' and '电影引言'.
    """

    # Captures, per movie: first title span, the director/cast line, the
    # rating, and whatever markup sits between the vote count and the closing
    # </div> (the optional quote paragraph).
    _ITEM_PATTERN = re.compile(
        r'<span class="title">(.*?)</span>.*?<p class="">\n\s*(.*?)<br>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>\d*人评价</span>\s*</div>\s*(.*?)</div>',
        re.S,
    )
    # Strips the quote paragraph markup, leaving only the quote text.
    _QUOTE_PATTERN = re.compile(
        r'<p class="quote">\s*<span class="inq">(.*?)</span>\s*</p>\s*'
    )

    def __init__(self):
        self.url_temp = "https://movie.douban.com/top250?start={}"
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'}
        self.movie_data = []

    def _parse_page(self, text):
        """Return the list of movie dicts extracted from one page of HTML."""
        movies = []
        for title, credits, score, quote_html in self._ITEM_PATTERN.findall(text):
            movies.append({
                '电影名称': title,
                '导演与主演': credits.replace(' ', ''),
                '电影评分': score,
                # Movies without a quote paragraph get the placeholder '无'.
                '电影引言': self._QUOTE_PATTERN.sub(r'\1', quote_html if quote_html else '无'),
            })
        return movies

    def get_movie_data(self, url_list=None, timeout=10):
        """Fetch each page and append its parsed movies to ``self.movie_data``.

        Args:
            url_list: Iterable of page URLs to fetch. When omitted, the ten
                Top 250 pages (start=0, 25, ..., 225) are built from
                ``self.url_temp``, so the method no longer depends on a
                module-level ``url_list`` variable being defined.
            timeout: Seconds to wait for each HTTP request before giving up.

        A page whose request fails (network error, timeout, or HTTP error
        status) is reported and skipped; the remaining pages are still
        fetched.
        """
        if url_list is None:
            url_list = [self.url_temp.format(start) for start in range(0, 250, 25)]
        for page_no, url in enumerate(url_list, start=1):
            try:
                response = requests.get(url, headers=self.headers, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as err:
                print('----第{}页请求失败: {}----'.format(page_no, err))
            else:
                self.movie_data.append(self._parse_page(response.text))
                print('----第{}页----'.format(page_no))
            # Pause between requests to avoid hammering the site.
            time.sleep(1)
        print(self.movie_data)

    def save_to_excel(self, wb):
        """Write a header row plus all scraped movies to ``wb``'s active sheet.

        The workbook is saved as 'douban_top250.xlsx' and then closed.
        """
        ws = wb.active
        ws.append(['电影名称', '导演与主演', '电影评分', '电影引言'])
        for movielist in self.movie_data:
            for movie in movielist:
                ws.append([movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言']])
        wb.save('douban_top250.xlsx')
        wb.close()

    def save_to_sql(self):
        """Insert all scraped movies into the ``movies`` table of a local MySQL DB.

        The table is created if it does not exist. Connection failures and
        statement failures are reported separately, and the connection is
        always closed once it has been opened.
        """
        try:
            conn = pymysql.connect(host='localhost',port=3306, user='root', password='yourpassword', db='dbtest')
        except pymysql.Error:
            print('数据库无法连接')
            return
        print('数据库成功连接')

        create_table_sql = '''CREATE TABLE IF NOT EXISTS movies (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(20),
        director VARCHAR(100),
        score FLOAT,
        rate VARCHAR(100)
        );'''
        insert_sql = '''
        INSERT INTO movies (title, director, score, rate) VALUES (%s, %s, %s, %s)
        '''
        rows = [
            (movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言'])
            for movielist in self.movie_data
            for movie in movielist
        ]
        try:
            with conn.cursor() as cursor:
                cursor.execute(create_table_sql)
                cursor.executemany(insert_sql, rows)
            conn.commit()
        except pymysql.Error as err:
            # A failure here is a statement/commit problem, not a connection
            # problem, so report it as such instead of "cannot connect".
            print('数据库操作失败: {}'.format(err))
        finally:
            conn.close()
if __name__ == '__main__':
    dbspider = DoubanSpider()
    # URLs of the 10 result pages (25 movies per page).
    url_list = [dbspider.url_temp.format(i*25) for i in range(10)]
    dbspider.get_movie_data()
    # Reuse the existing spreadsheet when present; on a first run the file
    # does not exist yet, so start from a fresh workbook instead of crashing
    # after all pages have already been scraped.
    try:
        wb = openpyxl.load_workbook('douban_top250.xlsx')
    except FileNotFoundError:
        wb = openpyxl.Workbook()
    dbspider.save_to_excel(wb)
    # Optional: also persist the results to MySQL.
    #dbspider.save_to_sql()