# -*- coding: utf-8 -*-
# @Time: 20:55
# @Author: zzc
# @File: spider.py
# @Software: PyCharm
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import openpyxl
import re
import sqlite3
# Non-greedy regexes for the fields inside each movie's <div class="item"> block
link_pattern = '<a href="(.*?)">'
src_pattern = 'src="(.*?)"'
cname_pattern = '<span class="title">(.*?)</span>'
rating_pattern = '<span class="rating_num" property="v:average">(.*?)</span>'
judge_pattern = '<span>(.*?)人评价</span>'
inq_pattern = re.compile('<span class="inq">(.*?)</span>')
content_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)  # re.S lets . match newlines too
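# Illustration of the non-greedy group (hypothetical input snippet):
#   re.findall(cname_pattern, '<span class="title">肖申克的救赎</span>')
#   -> ['肖申克的救赎']   # .*? stops at the first closing </span>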

def main():
    baseurl = 'https://movie.douban.com/top250?start='
    # 1. scrape the data
    data_list = get_data(baseurl)
    # 2. save the data, either to xlsx ...
    # save_path = './豆瓣电影Top250.xlsx'
    # save_data(data_list, save_path)
    # ... or to SQLite
    db_file = 'douban.db'
    save_data2db(data_list, db_file)

def init_db(db_file):
    '''Create the table that will hold the scraped data.'''
    create_table_sql = '''
        create table if not exists douban
        (
            id integer primary key autoincrement,
            address_link text,
            img_link text,
            cname varchar,
            ename varchar,
            rate numeric,
            judge numeric,
            inq text,
            content text
        )
    '''
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    cursor.execute(create_table_sql)
    conn.commit()
    conn.close()
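# Note: "if not exists" makes init_db safe to call on every run; without it a
# second run would raise sqlite3.OperationalError because the table already exists.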

def save_data2db(data_list, db_file):
    '''Save the scraped data into the SQLite database.'''
    init_db(db_file)  # create the table
    # print(data_list)
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    # SQL method one: build the statement by string concatenation
    # for rows in data_list:
    #     fmt = ''
    #     for index in range(len(rows)):
    #         if index == len(rows) - 1:
    #             fmt += '"' + rows[index] + '"'
    #         else:
    #             fmt += '"' + rows[index] + '"' + ','
    #     # assemble the sql
    #     sql = f'insert into douban values(null,{fmt})'
    # SQL method two: parameterized query
    for row in data_list:
        tup = tuple(row)
        sql = 'insert into douban values(null,?,?,?,?,?,?,?,?)'
        cursor.execute(sql, tup)  # the second argument accepts either a list or a tuple
    conn.commit()
    conn.close()
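# Note: method two binds values through "?" placeholders, so sqlite3 handles all
# quoting; method one's string-built SQL would break on any value containing '"'.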

def ask_url(url):
    '''Request a URL and return the page HTML ('' on failure).'''
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }  # pose as a regular browser
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        resp = urllib.request.urlopen(request)
        html = resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
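# When a request fails, ask_url returns '' and get_data simply finds no movie
# items on that page, so a single bad page does not abort the whole run.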

def get_data(baseurl):
    '''Scrape all 10 pages and return a list of 8-field rows.'''
    data_list = []
    for i in range(0, 10):  # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = ask_url(url)
        # parse the page item by item
        soup = BeautifulSoup(html, 'html.parser')
        item_list = soup.find_all('div', class_="item")  # every div with class "item" is one movie
        for item in item_list:
            data = []
            # print(item)
            str_item = str(item)
            # pull the fields out with the regexes
            link = re.findall(link_pattern, str_item)
            data.append(link[0])  # detail-page link
            src = re.findall(src_pattern, str_item)
            data.append(src[0])  # poster image
            cname = re.findall(cname_pattern, str_item)  # titles
            # [data.append(x.replace('\xa0/\xa0', '')) for x in cname]
            if len(cname) == 2:
                data.append(cname[0])
                ename = cname[1].replace('\xa0/\xa0', '')  # English title
                data.append(ename)
            else:
                # only a Chinese title is present
                data.append(cname[0])
                data.append(' ')
            rating = re.findall(rating_pattern, str_item)
            data.append(rating[0])  # rating
            judge = re.findall(judge_pattern, str_item)
            data.append(judge[0])  # number of raters
            inq = re.findall(inq_pattern, str_item)
            if inq:
                data.append(inq[0])
            else:
                data.append(" ")
            content = re.findall(content_pattern, str_item)
            content = re.sub(r'<br/>', '', content[0])
            content = content.strip()  # strip surrounding whitespace
            data.append(content)
            data_list.append(data)
    return data_list
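# data_list ends up with up to 250 rows of 8 fields each, in the same order as
# the 8 "?" placeholders that save_data2db binds them to.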

def save_data(data_list, save_path):
    '''Save the data to an xlsx workbook.'''
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '豆瓣电影Top250'
    first_row = ('地址', '图片', '中文名称', '外文名称', '评分', '评论人数', '简介', '内容')
    # ws.append(first_row)
    for i in range(1, 9):
        ws.cell(row=1, column=i).value = first_row[i - 1]
    # one spreadsheet row per movie, below the header row
    for row in range(len(data_list)):
        data = data_list[row]
        for col in range(0, 8):
            ws.cell(row=row + 2, column=col + 1).value = data[col]
    wb.save(save_path)

if __name__ == '__main__':
    main()
    # init_db('demo.db')
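# A quick sanity check of the saved rows (assumes the default douban.db path):
#   conn = sqlite3.connect('douban.db')
#   print(conn.execute('select count(*) from douban').fetchone())  # expect (250,)
#   conn.close()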
The scraped data is successfully stored in the database: