无聊随便玩玩，要爬成功还早着呢。代码很乱，可以整理；写了就记录一下吧，有机会再改。
import requests
import os
from bs4 import BeautifulSoup
from requests.packages import urllib3
import random
import threading
import time
# Silence urllib3 warnings; the requests made by this script pass verify=False.
urllib3.disable_warnings()
# First and last listing page to crawl (both inclusive).
start_page = 1
end_page = 1
# Output directory for downloaded images. exist_ok=True replaces the
# check-then-create pair, which could fail if the directory appeared in between.
os.makedirs("gq_sjbz", exist_ok=True)
# Listing-page URL template; "{}" is filled with the page number.
base_url = "https://www.3gbizhi.com/sjbz/index_{}.html"
# Image URLs whose download failed; they are retried after the main crawl.
time_out_urls = []
print()
def crawl_page(page):
    """Download the preview image of every wallpaper linked from one listing page.

    The listing page is ``base_url`` formatted with ``page``.  Every link found
    under ``div.contlistw ul.cl`` is followed, and the ``img#contpic`` preview
    on each detail page is saved into ``gq_sjbz/``.  Image URLs that could not
    be downloaded are appended to the module-level ``time_out_urls`` list.

    Args:
        page: Listing page number substituted into ``base_url``.

    Returns:
        None.  Failed page requests are reported on stdout, not raised.
    """
    # Function-scope import: only this function needs urljoin.
    from urllib.parse import urljoin

    url = base_url.format(page)
    user_agent = random.choice(
        ['Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'])
    # Author's note: bring your own proxy/IP pool if you need one.
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, verify=False, timeout=20)
        # requests.get() does not raise on 4xx/5xx by itself.
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException also covers timeouts and connection errors, which
        # would otherwise terminate the calling thread.
        print(f"HTTP请求错误: {e}")
        return

    soup = BeautifulSoup(resp.text, 'html.parser')
    for ul in soup.select("div.contlistw ul.cl"):
        for a_href in ul.find_all('a', href=True):
            # Resolve relative / protocol-relative links against the listing URL.
            detail_url = urljoin(url, a_href['href'])
            try:
                resp2 = requests.get(detail_url, headers=headers, verify=False, timeout=20)
                resp2.raise_for_status()
            except requests.exceptions.RequestException as e:
                # One broken detail page must not abort the rest of the listing.
                print(f"HTTP请求错误: {e}")
                continue
            soup2 = BeautifulSoup(resp2.text, 'html.parser')
            # TODO: this only downloads the preview image shown on the page,
            # not the HD original; fetching the original is the next step.
            # TODO: the original apparently has to be obtained through an API
            # that returns something like:
            # {
            #     "file": "/api/user/imageDownload?downconfig=e03UPLG76erry5Fo6ZT7Zw%3D%3D3gbizhiComgV1S3%2BO8DlxWKbNOuZ7BLw%3D%3D&op=file&picnum=1&captcha=267",
            #     "zip": "/api/user/imageDownload?downconfig=e03UPLG76erry5Fo6ZT7Zw%3D%3D3gbizhiComgV1S3%2BO8DlxWKbNOuZ7BLw%3D%3D&op=zip&picnum=1&captcha=267"
            # }
            # which requires getting past the human-verification (captcha) step.
            for img in soup2.select("div.img-table img#contpic"):
                src = img.get('src')
                if not src:
                    # An <img> without a src has nothing to download.
                    continue
                img_url = urljoin(detail_url, src)
                img_name = os.path.basename(img_url)
                try:
                    img_resp = requests.get(img_url, verify=False, timeout=3)
                    if img_resp.status_code == 200:
                        with open(os.path.join("gq_sjbz", img_name), "wb") as img_file:
                            img_file.write(img_resp.content)
                        print(f"Downloaded image: {img_url} 超时数量:{len(time_out_urls)}")
                    else:
                        time_out_urls.append(img_url)
                except (requests.exceptions.RequestException, OSError):
                    # Network or file-write failure: queue the URL for the retry
                    # pass.  Unlike a bare except, this lets KeyboardInterrupt
                    # and SystemExit propagate.
                    time_out_urls.append(img_url)
    print(f"爬取第 {page} 页完成")
# Batch crawler: handles a contiguous run of listing pages.
def crawl_batch(start, end):
    """Call ``crawl_page`` for every page number from ``start`` to ``end`` inclusive."""
    page_numbers = range(start, end + 1)
    for page_number in page_numbers:
        crawl_page(page_number)
# Number of pages crawled per batch.
batch_size = 20
# Crawl the configured page range in batches, one thread per batch.
def run_threads():
    """Start one ``crawl_batch`` thread per batch of pages and wait for all of them.

    Batches cover ``start_page``..``end_page`` in steps of ``batch_size``; the
    last batch is clipped to ``end_page``.
    """
    workers = [
        threading.Thread(
            target=crawl_batch,
            args=(first_page, min(first_page + batch_size - 1, end_page)),
        )
        for first_page in range(start_page, end_page + 1, batch_size)
    ]
    for worker in workers:
        worker.start()
    # Block until every batch thread has finished.
    for worker in workers:
        worker.join()
# Start the multi-threaded crawl; returns once all batch threads have joined.
run_threads()
def time_out_urls_download(max_download_time, retry_delay=1.0):
    """Retry the image URLs queued in ``time_out_urls`` until done or out of time.

    URLs are taken from the front of the module-level ``time_out_urls`` list.
    A successful download is written into ``gq_sjbz/``; a failed one is put
    back at the end of the list to be tried again later.

    Args:
        max_download_time: Time budget in seconds.  Once exceeded, the URLs
            still pending are printed and the function returns.
        retry_delay: Seconds to pause after a failed attempt, so that a URL
            which keeps failing is not requested in a tight loop.  Pass 0 to
            retry immediately.

    Returns:
        None.
    """
    start_time = time.time()  # start of the time budget
    while time_out_urls:
        img_url = time_out_urls.pop(0)  # take the oldest pending URL
        img_name = os.path.basename(img_url)
        try:
            img_resp = requests.get(img_url, verify=False, timeout=3)
            downloaded = img_resp.status_code == 200
            if downloaded:
                with open(os.path.join("gq_sjbz", img_name), "wb") as img_file:
                    img_file.write(img_resp.content)
        except (requests.exceptions.RequestException, OSError):
            # Network or file-write failure: treat as "not downloaded yet".
            downloaded = False

        if downloaded:
            print(f"Downloaded image: {img_url} {len(time_out_urls)}")
        else:
            time_out_urls.append(img_url)

        elapsed_time = time.time() - start_time
        if elapsed_time > max_download_time:
            # Out of time: report what is still pending and stop.
            print(time_out_urls)
            break
        if not downloaded and retry_delay > 0:
            # Back off before the next attempt, without overshooting the budget.
            time.sleep(min(retry_delay, max_download_time - elapsed_time))
# Maximum time to spend retrying failed downloads, in seconds.
max_download_time = 1200  # 1200 s = 20 minutes
# Run the retry pass over the URLs collected in time_out_urls.
time_out_urls_download(max_download_time)
print("爬取完成------------------------------------------------------------------------------")
标签:img,python,爬取,url,urls,time,壁纸,page,out
From: https://www.cnblogs.com/Airgity/p/17732794.html