Without further ado, here is the code!
import os
import random
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from queue import Queue, Empty

import requests
from lxml import etree
from tqdm import tqdm

# Queue shared between the page-parsing producers and the download consumers
q = Queue(maxsize=300)
# Thread pool used to parse the search-result pages
pool = ThreadPoolExecutor(max_workers=10)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/105.0.0.0 Safari/537.36'
}


# Producer (runs in the thread pool): parse a search-result page and put
# each wallpaper's image URL and title into the queue.
def utl_image(url, cursor):
    # 1. Fetch the search-result page
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    page_content = resp.text
    # 2. Parse it and collect the links to the wallpaper detail pages
    tree = etree.HTML(page_content)
    hrefs = tree.xpath('//a[@class="preview"]/@href')
    print(f'Found {len(hrefs)} wallpapers on this page.')
    for a_href in tqdm(hrefs, desc='hrefs -> queue'):
        time.sleep(random.randint(1, 5))  # throttle the requests
        resp_ah = requests.get(str(a_href), headers=headers)
        resp_ah.encoding = resp_ah.apparent_encoding
        # Parse the detail page and pull out the full-size image element
        tre = etree.HTML(resp_ah.text)
        img_list = tre.xpath('//img[@id="wallpaper"]')
        print(f'Worker {cursor}, a_href: {a_href}, img_list: {img_list}')
        for img in img_list:
            img_src = img.xpath('./@src')[0]
            img_alt = img.xpath('./@alt')[0]
            q.put([img_src, img_alt])


# Consumer (runs in plain threads): take an image URL and title from the queue and save the file.
def image_save():
    thread_name = uuid.uuid1()
    count = 1
    path = 'wallhaven'
    os.makedirs(path, exist_ok=True)
    while True:
        print(f'Queue [consume], messages currently queued: {q.qsize()}')
        print(f'Thread {thread_name}, consuming message #{count}')
        try:
            # Stop once the queue has stayed empty for a while (the producers are done)
            img_src, img_alt = q.get(timeout=180)
        except Empty:
            break
        # Download the image itself
        r = requests.get(url=str(img_src), headers=headers)
        if r.status_code == 200:
            print(f'{img_alt} download [start]')
            with open(f'{path}/{img_alt[:40]}.jpg', 'wb') as f:
                f.write(r.content)
            print(f'{img_alt} download [ok]')
        time.sleep(random.randint(1, 3))  # throttle the downloads
        count += 1


if __name__ == '__main__':
    # 1. Build the list of search-result pages to crawl
    url = 'https://wallhaven.cc/search?categories=001&purity=100&ratios=landscape&topRange=1y&sorting=toplist&order=desc&ai_art_filter=1&page='
    url_list = [f'{url}{i}' for i in range(1, 2)]  # page range: start inclusive, end exclusive
    i = 1
    for ul in tqdm(url_list, desc='thread pool'):
        print(f'ul: {ul}')
        # Parse each page in the thread pool
        pool.submit(utl_image, ul, i)
        time.sleep(1)
        i = i + 1
    # Download with plain threads; the range value is the number of consumer threads
    for _ in range(1):
        t = threading.Thread(target=image_save)
        t.start()
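The script is a small producer-consumer pipeline: the ThreadPoolExecutor workers parse the search pages and push (image URL, title) pairs into the Queue, while plain threads pop from it and write the files to disk. One detail worth knowing is that the consumers only stop when q.get(timeout=180) times out. If you prefer an explicit shutdown, a common alternative is to push a sentinel object into the queue once the producers are done. The snippet below is only a minimal sketch of that idea, separate from the script above; STOP, consumer, and the sample item list are made-up names used purely for illustration.

import threading
from queue import Queue

STOP = object()  # sentinel marking "no more work" (illustrative name)

def consumer(q: Queue):
    while True:
        item = q.get()
        if item is STOP:
            q.put(STOP)  # pass the sentinel on so sibling consumers also exit
            break
        print(f'downloading {item}')  # placeholder for the real download/save logic

if __name__ == '__main__':
    q = Queue(maxsize=300)
    workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(3)]
    for w in workers:
        w.start()
    for item in ['img_1.jpg', 'img_2.jpg', 'img_3.jpg']:  # stands in for the parsed image URLs
        q.put(item)
    q.put(STOP)  # signal shutdown once all producers have finished
    for w in workers:
        w.join()

Because each consumer puts the sentinel back before exiting, a single STOP value is enough to shut down any number of consumer threads.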
By the way:
4K-8K resources are shared here: https://www.cnblogs.com/kukuDF/p/15989961.html