import os
import random
import re
import sys
import time
import urllib.parse
import requests
from lxml import etree
from lxml.etree import _Element


class TiebaSpider(object):
    BASE_DIR = os.path.dirname(__file__)

    def __init__(self, url, name):
        self.url = url
        self.name = name
        self.header = {
            "Host": "tieba.baidu.com",
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/104.0",
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)"
        }
        # Create the directory used to save the images
        if not os.path.exists(os.path.join(self.BASE_DIR, self.name)):
            print(f"Creating directory: {self.name}")
            os.mkdir(os.path.join(self.BASE_DIR, self.name))

    def get_data(self, pagesize):
        """Fetch the response body for one page of the forum list."""
        # Requests with and without a UA return different content, which shows the site checks the UA.
        # Same thing I ran into before when doing POST requests in Easy Language (易语言)...
        resp = requests.get(f"{self.url}{urllib.parse.quote(self.name)}&ie=utf-8&pn={pagesize * 50}",
                            headers=self.header)
        return resp.content
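
    # Illustration (not in the original): with the default keyword below, a call like
    # get_data(1) requests roughly
    #   https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=50
    # i.e. the tieba name is URL-encoded into kw and pn advances in steps of 50 per page.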

    def parse_data(self, data):
        """
        1. Thread list: //div[@class="threadlist_title pull_left j_th_tit"]
           If an <i> element can be found under the item (./i[@title="置顶"]) it is a pinned
           thread and should be excluded (see the helper sketch after this method).
           URL of each thread:
               ./a/@href
           Title of each thread: ./a
        2. Visit each thread URL again, grab the images inside and download them
           (only the first page of each thread, to keep this short...)
        """
        etree_html: _Element = etree.HTML(data.decode())
        tiezi_elements = etree_html.xpath(
            '//div[@class="threadlist_title pull_left j_th_tit"] | //div[@class="threadlist_title pull_left j_th_tit "]')
        clist = []
        for element in tiezi_elements:
            title = element.xpath("./a/text()")[0]
            url = element.xpath("./a/@href")[0]
            clist.append({
                "title": title,
                "url": url
            })
        return clist
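
    # Sketch (not part of the original code): the docstring above says pinned threads carry
    # an ./i[@title="置顶"] child, but parse_data does not actually filter them out.
    # Assuming that marker, a small helper like this could be used to skip them,
    # e.g. inside the loop: if TiebaSpider.is_pinned(element): continue
    @staticmethod
    def is_pinned(element):
        """Return True if the thread element carries the pinned ("置顶") marker."""
        return bool(element.xpath('./i[@title="置顶"]'))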

    def parse_detail(self, data):
        """Visit each thread in the list and collect the image URLs inside it."""
        for item in data:
            # Build the full URL
            full_url = "https://tieba.baidu.com" + item["url"]
            print(f"Fetching images from thread [{item.get('title')}] ...")
            # Send the request
            resp = requests.get(full_url, headers=self.header)
            content = resp.content.decode()
            # Initialise the element tree
            etree_html: _Element = etree.HTML(content)
            # Collect the image URLs
            src_list = etree_html.xpath('//img[@class="BDE_Image"]/@src')
            print(src_list)
            print("*" * 150)
            # Create a sub-directory named after the thread, then save its images
            tiezi_dir = os.path.join(self.BASE_DIR, self.name, item["title"])
            try:
                os.mkdir(tiezi_dir)
            except FileExistsError:
                print(tiezi_dir + " already exists, skipping directory creation.")
            self.save_pics(tiezi_dir, src_list)

    def save_pics(self, dirname, pic_list):
        """
        :param dirname: directory path of the thread
        :param pic_list: list of image URLs found in the thread
        :return:
        """
        print(dirname, pic_list)
        for pic_url in pic_list:
            # Recover the original file name with a regex.
            # replace("/", "") is used because the sign part looks like
            # sign=69f653d42559252da3171d0c049a032c/f0d931adcbef7609fb3928516bdda3cc7dd99ef8.jpg
            # so the "/" is simply stripped out.
            pic_name = re.search(r'sign=(.*)\?tbpicau=', pic_url).group(1).replace("/", "")
            # Images are binary data, so open the file in "wb" mode
            with open(os.path.join(dirname, pic_name), "wb") as f:
                # Downloading Tieba images needs different request headers, otherwise the server returns 403
                f.write(requests.get(pic_url, headers={
                    "Host": "tiebapic.baidu.com",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
                }).content)
                f.flush()

    def run(self, size):
        """The next-page link is not followed via xpath here (not very interesting); just loop a fixed number of times."""
        for i in range(1, size + 1):
            print(f"Reading page {i} of the [{self.name}] tieba....")
            data = self.get_data(i)
            # Get the thread URLs on this page
            parse_data = self.parse_data(data)
            # Parse each thread, collect its image URLs and save them
            self.parse_detail(parse_data)
            # Sleep a little so the anti-crawling checks don't flag us for going too fast...
            time.sleep(random.randint(1, 5))


if __name__ == '__main__':
    # Grab the images from every thread of the given tieba and save them.
    if len(sys.argv) >= 3:
        name = sys.argv[1]
        pages = int(sys.argv[2])
    else:
        pages = 1
        name = "海贼王"
    url = "https://tieba.baidu.com/f?kw="
    TiebaSpider(url, name).run(pages)
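
# Example invocation (assuming the script is saved as tieba_spider.py; the file name is
# not given in the original):
#   python tieba_spider.py 海贼王 2
# which crawls two list pages of the 海贼王 tieba and saves the images under a directory
# named after the tieba, one sub-directory per thread.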