#!/usr/bin/env python3
"""Batch-download images whose URLs are listed in CSV files.

Each CSV row supplies an image URL, a licence plate and a capture time.
Every URL is fetched in its own child process (with HTTP retries) and the
image is stored as ``<plate>_<capture_time>+<row index>.jpg``. At most
``cpu_count()`` child processes run at the same time.
"""
import logging
import multiprocessing
import os
import re
import sys
import time
from logging.handlers import RotatingFileHandler
from multiprocessing import Process
from multiprocessing import cpu_count
from multiprocessing.connection import wait

import pandas as pd
import requests
from clickhouse_driver import Client
from requests import Session
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry

# NOTE(review): ``re``, ``time`` and ``Client`` appear unused in this file;
# the imports are kept so nothing that relies on them breaks.

# Directory containing this script; log file and output folders live here.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# (connect, read) timeout in seconds so a stalled server cannot hang a child
# process forever.
REQUEST_TIMEOUT = (5, 30)

# Module-level so that child processes can log even when this module is
# re-imported by the ``spawn`` start method (where the ``__main__`` guard
# below is not executed).
logger = logging.getLogger()


def _configure_logging():
    """Attach the log-file handler to the root logger, at most once."""
    if logger.handlers:
        return
    logger.setLevel(logging.INFO)
    handler = RotatingFileHandler(os.path.join(BASE_DIR, "dl_images.log"))
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)


def _read_fail_records(csv_path):
    """Return the URL, plate and capture-time columns of ``csv_path``."""
    df = pd.read_csv(csv_path)
    return df['image_url1'], df['license_plate2'], df['capture_time']


def get_xg_images_url():
    """Return (image URLs, licence plates, capture times) from xg_fail_rec.csv."""
    return _read_fail_records('./xg_fail_rec.csv')


def get_am_images_url():
    """Return (image URLs, licence plates, capture times) from am_fail_rec.csv."""
    return _read_fail_records('./am_fail_rec.csv')


def _count_failure(access_fail_c):
    """Increment the shared failure counter atomically."""
    # A plain ``value = value + 1`` is a read-modify-write and loses updates
    # when several processes fail at the same time.
    with access_fail_c.get_lock():
        access_fail_c.value += 1


def download_img(img_url, num, result_path, plateNo, ct, access_fail_c):
    """Download one image and save it under ``result_path``.

    Args:
        img_url: URL of the image to fetch.
        num: Row index of the record; used in log messages and the file name.
        result_path: Directory the image is written to (created if missing).
        plateNo: Licence plate, first part of the file name.
        ct: Capture time, second part of the file name.
        access_fail_c: Shared ``multiprocessing.Value`` counting failures.

    The process exits with status 1 when the request itself fails (connection
    error, timeout, retries exhausted). A non-200 response or a failed write
    is counted and logged, and the function returns normally.
    """
    _configure_logging()

    # ``format`` (rather than ``+``) so non-string CSV values do not raise.
    img_name = '{}_{}+{}.jpg'.format(plateNo, ct, num)
    img_path = os.path.join(result_path, img_name)

    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            # Apply the retry policy to both schemes, not only plain HTTP.
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            img_obj = s.get(img_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        _count_failure(access_fail_c)
        logger.error("connect fail %s ", img_url)
        logger.info("child_process %s exited... ", num)
        sys.exit(1)

    if img_obj.status_code != 200:
        _count_failure(access_fail_c)
        logger.error("download %s fail staus = %s",
                     img_url, img_obj.status_code)
    else:
        try:
            os.makedirs(result_path, exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
            logger.info("saved success %s staus = %s",
                        img_url, img_obj.status_code)
        except OSError:
            _count_failure(access_fail_c)
            logger.error("saved fail %s fail staus = %s",
                         img_url, img_obj.status_code)
    logger.info("child_process %s exited... \n", num)


def _reap_finished(running):
    """Block until at least one child has exited; return those still running."""
    finished = set(wait([p.sentinel for p in running]))
    still_running = []
    for p in running:
        if p.sentinel in finished:
            p.join()  # collect the exited child
        else:
            still_running.append(p)
    return still_running


def _download_batch(image_urls, license_plates, capture_times,
                    result_path, access_fail_c):
    """Download every record, running at most ``cpu_count()`` children at once."""
    max_procs = cpu_count()
    running = []
    records = zip(image_urls, license_plates, capture_times)
    for i, (url, plate, ct) in enumerate(records):
        while len(running) >= max_procs:
            running = _reap_finished(running)
        proc = Process(target=download_img,
                       args=(url, i, result_path, plate, ct, access_fail_c))
        logger.info("child_process %s started... ", i)
        proc.start()
        running.append(proc)
    for p in running:
        p.join()


def start_process():
    """Download the Hong Kong plate images, then the Macau plate images."""
    # Integer counter (the failure total is a whole number), with the default
    # lock that ``_count_failure`` relies on.
    access_fail_c = multiprocessing.Value('i', 0)

    # Hong Kong plate images.
    image_url, license_plate, cts = get_xg_images_url()
    print('港牌url总数: {}'.format(len(image_url)))
    _download_batch(image_url, license_plate, cts,
                    os.path.join(BASE_DIR, "fail_images", "fail_xg"),
                    access_fail_c)

    # Macau plate images.
    image_url, license_plate, cts = get_am_images_url()
    print('澳牌url总数: {}'.format(len(image_url)))
    _download_batch(image_url, license_plate, cts,
                    os.path.join(BASE_DIR, "fail_images", "fail_am"),
                    access_fail_c)

    print('访问图片失败总数:{}'.format(access_fail_c.value))


if __name__ == '__main__':
    _configure_logging()
    start_process()
# Tags: dl, img, url, py, list, fail, images, import, process
# Source: https://www.cnblogs.com/lfxx/p/17745177.html