from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
import requests
from lxml import etree
from urllib import parse

# TODO: exception handling is not implemented yet.
# Open issue 1: only the current images of the listing pages are scraped;
#               images nested inside each page are not handled yet.
# Open issue 2: scraping too many pages raises an error; cause not yet found.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
    # Anti-hotlinking header: identifies the page this request came from
    "Referer": "https://xxx"
}
def get_img_src(q):
    """Scrape image URLs from listing pages 1-4 and push them onto *q*.

    Collects every detail-page link from each listing page, then fetches each
    detail page and extracts its first image URL.  Each URL is put on the
    queue for the downloader process; the sentinel string "完事了" is put
    last so the consumer knows production is finished.
    """
    # Build the listing-page URLs (page 1 has a different filename).
    urls = []
    for page in range(1, 5):
        if page == 1:
            urls.append("https://xxx/index.html")
        else:
            urls.append(f"https://xxx/{page}.html")

    # Collect detail-page links from every listing page.
    href_list_all = []
    for url in urls:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = 'utf-8'
        tree = etree.HTML(resp.text)
        # Flatten as we go (the original appended whole lists and needed a
        # second nested loop to unwrap them).
        href_list_all.extend(tree.xpath("//div[@class='list-box-p']/ul/li/a/@href"))

    # Visit each detail page and queue its first image URL.
    for href in href_list_all:
        child_resp = requests.get(href, headers=headers, timeout=10)
        child_resp.encoding = 'utf-8'
        child_tree = etree.HTML(child_resp.text)
        src_list = child_tree.xpath("//div[@class='img_box']/a/img/@src")
        if not src_list:
            # Guard: the original indexed [0] unconditionally and crashed
            # with IndexError when a page had no matching image.
            continue
        src = src_list[0]
        q.put(src)  # hand the URL to the downloader process
        print(f"---------------------------------------------------被塞进队列--------------------->{src}")
    q.put("完事了")  # sentinel: tells the consumer no more URLs are coming
def download(src):
    """Download one image from *src* into ./image/, named after the URL tail."""
    import os  # local import keeps this block self-contained

    print('开始下载------------>', src)
    name = src.split('/')[-1]
    # The original crashed with FileNotFoundError when ./image was missing.
    os.makedirs("./image", exist_ok=True)
    # Fetch BEFORE opening the file, so a failed request does not leave an
    # empty/truncated file behind (the original opened the file first).
    resp = requests.get(src, headers=headers, timeout=10)
    try:
        # Surface HTTP errors instead of silently saving an error page as an image.
        resp.raise_for_status()
        with open("./image/" + name, mode='wb') as f:
            f.write(resp.content)
    finally:
        resp.close()  # release the connection even on error
    print('下载完毕------------>', src)
def download_img(q):
    """Pull image URLs off *q* and download each one on a 5-thread pool.

    Blocks on q.get() while the queue is empty; stops once the producer's
    sentinel string "完事了" is received.
    """
    with ThreadPoolExecutor(5) as pool:
        # iter() with a sentinel keeps yielding q.get() results until one
        # compares equal to the sentinel, then stops.
        for src in iter(q.get, "完事了"):
            pool.submit(download, src)
if __name__ == '__main__':
    # Producer/consumer pair: p1 scrapes image URLs into the queue,
    # p2 drains the queue and downloads the images.
    q = Queue()
    p1 = Process(target=get_img_src, args=(q,))
    p2 = Process(target=download_img, args=(q,))
    p1.start()
    p2.start()
    # Wait for both children: the original never joined them, so the parent
    # could exit before the queue's feeder thread finished flushing.
    p1.join()
    p2.join()