import json
import os
import requests
from lxml import etree
from lxml.etree import _Element
class DoubanMovieSpider(object):
    """Fetch the Douban movie chart page and dump its entries to a JSON file.

    The spider downloads ``self.url``, extracts one record per chart row
    (title, detail link, poster image, rating) and writes the records as
    JSON to ``douban.json`` in the current working directory.

    The output file is opened when the instance is created and stays open
    until :meth:`close` is called. The instance can be used as a context
    manager so the file is closed deterministically; ``__del__`` is kept
    only as a last-resort safety net.
    """

    def __init__(self, timeout=10):
        """Set up request parameters and open the output file.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument, so a stalled connection cannot block forever.
        """
        self.url = "https://movie.douban.com/chart"
        self.headers = {
            "Host": "movie.douban.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        self.timeout = timeout
        # The output file is opened (and truncated, mode "w") at construction
        # time and kept open for the lifetime of the instance.
        self.file = open("douban.json", "w", encoding="utf-8")
        # NOTE(review): process-wide side effect; presumably meant to make
        # requests ignore any configured proxy -- confirm this is wanted.
        os.environ["NO_PROXY"] = "*"

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file when leaving a ``with`` block."""
        self.close()
        return False

    def close(self):
        """Close the output file; safe to call more than once."""
        # getattr guards against __init__ having failed before self.file
        # was assigned (e.g. open() raised).
        file = getattr(self, "file", None)
        if file is not None and not file.closed:
            file.close()

    def get_data(self):
        """Download the chart page and return the raw response body.

        Returns:
            The response body as bytes.

        Raises:
            requests.RequestException: On connection problems, on timeout,
                or when the server answers with an HTTP error status.
        """
        resp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # Fail loudly instead of parsing an error page into an empty result.
        resp.raise_for_status()
        return resp.content

    def parse_data(self, data):
        """Extract the movie records from the chart page HTML.

        Rows are selected with ``//table/tr[@class='item']``. Relative to
        each row:

        * title:  text of ``./td[@valign='top']/div/a``
        * link:   ``./td[@valign='top']/div/a/@href``
        * poster: ``./td[@valign='top']/a[@class='nbg']/img/@src``
        * rating: ``./td/div/div/span[@class='rating_nums']/text()``

        Args:
            data: Page body as bytes; decoded with the default codec (UTF-8).

        Returns:
            A list of dicts with the keys ``title``, ``href``, ``img`` and
            ``rating_num``. An empty payload yields an empty list.

        Raises:
            IndexError: If a row lacks the title anchor, link or poster image.
        """
        if not data:
            return []

        etree_html = etree.HTML(data.decode())  # type: _Element
        result_list = []
        for t_item in etree_html.xpath("//table/tr[@class='item']"):  # type: _Element
            anchor = t_item.xpath("./td[@valign='top']/div/a")[0]
            # .text is None when the anchor starts with a child element.
            title = (anchor.text or "").strip("\n /")
            href = t_item.xpath("./td[@valign='top']/div/a/@href")[0]
            img = str(t_item.xpath("./td[@valign='top']/a[@class='nbg']/img/@src")[0])
            # A row without a rating span gets the placeholder text instead;
            # only the "no match" case is handled, other errors propagate.
            ratings = t_item.xpath("./td/div/div/span[@class='rating_nums']/text()")
            rating_num = ratings[0] if ratings else "暂无评分"
            result_list.append({
                "title": title,
                "href": href,
                "img": img,
                "rating_num": rating_num,
            })
        return result_list

    def save_data(self, data):
        """Serialize ``data`` as indented, non-ASCII-escaped JSON to the file.

        Args:
            data: Any JSON-serializable object (normally the list returned
                by :meth:`parse_data`).
        """
        self.file.write(json.dumps(data, ensure_ascii=False, indent=4))
        # Push the data out of the buffer now rather than relying on the
        # file being closed later.
        self.file.flush()

    def __del__(self):
        """Best-effort cleanup in case :meth:`close` was never called."""
        try:
            self.close()
        except Exception as e:
            print(e)

    def run(self):
        """Fetch the page, parse it and write the result to the output file."""
        data = self.get_data()
        parse_data = self.parse_data(data)
        self.save_data(parse_data)
if __name__ == '__main__':
    # Script entry point: build the spider, then run one
    # fetch -> parse -> save pass.
    spider = DoubanMovieSpider()
    spider.run()
# Tags: __, item, self, scraping (爬取), example (案例), Douban (豆瓣), div, data, class
# From: https://www.cnblogs.com/juelian/p/17559512.html