#!/usr/bin/env python3
"""Download images whose URLs are stored in ClickHouse.

The script queries ``image_url1`` / ``capture_time`` pairs from
``yisa_oe.face_all``, then downloads every image into ``result_images/``
next to this file.  Each download runs in its own child process; at most
``cpu_count()`` children are alive at any time.  Progress and failures are
written to ``dl_images.log`` next to this file.
"""
import os
import sys
import datetime
import pandas as pd
import requests
from requests import Session
from requests.packages.urllib3.util import Retry
from requests.adapters import HTTPAdapter
import time
import logging
from logging.handlers import RotatingFileHandler
import re
from clickhouse_driver import Client
from multiprocessing import Process
from multiprocessing import cpu_count
import multiprocessing

# Root logger, defined at import time so that child processes started with the
# "spawn"/"forkserver" methods (which re-import this module instead of
# inheriting the parent's globals) can still resolve the name.
logger = logging.getLogger()

# (connect, read) timeout in seconds for every image request.  Without a
# timeout a stalled server would block a worker slot forever.
REQUEST_TIMEOUT = (5, 30)

# How long the parent sleeps between checks for a free worker slot.
POLL_INTERVAL = 0.05


def _configure_logging():
    """Attach the file handler to the root logger (idempotent).

    Does nothing when the root logger already has a handler, e.g. in a forked
    child that inherited the parent's configuration.
    """
    if logger.handlers:
        return
    logger.setLevel(logging.DEBUG)
    # No maxBytes is given, so the handler never actually rolls the file over.
    handler = RotatingFileHandler(os.path.dirname(os.path.abspath(__file__)) + "/dl_images.log")
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def _count_failure(access_fail_c):
    """Atomically add one to the shared failure counter.

    ``value = value + 1`` on a ``multiprocessing.Value`` is a read followed by
    a write; the counter's own lock must be held around both so concurrent
    children do not lose updates.
    """
    with access_fail_c.get_lock():
        access_fail_c.value += 1


def get_images_url():
    """Fetch the image URLs and capture times to download.

    Returns:
        A ``(image_url1, capture_time)`` tuple of pandas Series, aligned by
        position.

    Exits the process with status 1 if the client cannot be created or the
    query fails.
    """
    try:
        # NOTE(review): host, port and password are hard-coded in source;
        # consider moving the credentials to configuration/environment.
        cursor = Client(host='127.0.0.1', port=9001, password='Ys_gz@2022')
    except Exception:
        logger.exception("连接失败!")
        sys.exit(1)
    # Earlier variants of the query (vehicle_all rows for 粤Z…澳 / 粤Z…港
    # plates on a given day), kept for reference:
    #sql = "select license_plate2,xgbdp,ambdp,VIIDOBJECTID,location_id,capture_time,image_url1 from yisa_oe.vehicle_all where (license_plate2 like '粤Z%澳' or license_plate2 LIKE '粤Z%港') and date = '2022-06-26'"
    #sql = "select license_plate2,xgbdp,ambdp,VIIDOBJECTID,location_id,capture_time,image_url1 from yisa_oe.vehicle_all where license_plate2 LIKE '粤Z%港' and date = '2022-06-29'"
    sql = "select image_url1,capture_time from yisa_oe.face_all where hair_id = 1 and (clarity_id = 1 or clarity_id = 2) limit 20000"
    try:
        results = cursor.execute(sql)
    except Exception:
        logger.exception("语句执行错误!")
        sys.exit(1)
    finally:
        # The result set is already materialised; release the connection.
        cursor.disconnect()
    data_list = []
    for row in results:
        logger.info(row)
        data_list.append(list(row))
    #df = pd.DataFrame(data_list,columns=['license_plate2','xgbdp','ambdp','VIIDOBJECTID','location_id','capture_time','image_url1'])
    df = pd.DataFrame(data_list, columns=['image_url1', 'capture_time'])
    return df['image_url1'], df['capture_time']


def download_img(img_url, num, result_path, ct, access_fail_c):
    """Download one image and save it as ``_<ct>_<num>.jpg`` in *result_path*.

    Intended to run as the target of a child process.

    Args:
        img_url: URL of the image.
        num: Sequence number of this download; used in the file name and in
            log messages.
        result_path: Directory the image is written to.
        ct: Capture time of the image; converted with ``str()`` for the file
            name, so non-string values (e.g. datetimes) are accepted.
        access_fail_c: Shared ``multiprocessing.Value`` counting failed
            downloads; incremented on connection errors, non-200 responses
            and write errors.

    Exits the (child) process with status 1 when the request itself fails.
    """
    # A spawned child starts with an unconfigured root logger.
    _configure_logging()
    # Alternative naming scheme (file name taken from the URL), kept for reference:
    #img_list = re.findall(r'\/.*?\/.*?\/.*?\/.*?\/.*?\/.*?\/(.*\.)(JPG|jpg)',img_url)
    #img_name = img_list[0][0]+img_list[0][1]
    # Build an explicit path instead of chdir-ing into the directory.
    img_path = os.path.join(result_path, '_' + str(ct) + '_' + str(num) + '.jpg')
    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            img_obj = s.get(img_url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        _count_failure(access_fail_c)
        logger.error("connect fail %s ", img_url)
        logger.info("child_process %s exited... ", num)
        sys.exit(1)
    # Anything other than 200 (e.g. 404) is treated as a failed download.
    if img_obj.status_code != 200:
        _count_failure(access_fail_c)
        logger.error("download %s fail staus = %s", img_url, img_obj.status_code)
    else:
        try:
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
        except OSError:
            _count_failure(access_fail_c)
            logger.error("saved fail %s fail staus = %s", img_url, img_obj.status_code)
        else:
            logger.info("saved success %s staus = %s", img_url, img_obj.status_code)
    logger.info("child_process %s exited... ", num)


def start_process():
    """Query the image list and download every image in child processes.

    One process is started per URL, with at most ``cpu_count()`` alive at a
    time.  Blocks until all children have finished, then logs the number of
    failed downloads.
    """
    access_fail_c = multiprocessing.Value('i', 0)
    image_url, cts = get_images_url()
    result_path = os.path.dirname(os.path.abspath(__file__)) + "/result_images/"
    # The children write into this directory; make sure it exists.
    os.makedirs(result_path, exist_ok=True)
    max_workers = cpu_count()
    process_list = []
    for i, (url, ct) in enumerate(zip(image_url, cts)):
        # Wait for a free slot.  The list is rebuilt rather than mutated while
        # being iterated, and the parent sleeps between polls instead of
        # spinning.
        while len(process_list) >= max_workers:
            process_list = [p for p in process_list if p.is_alive()]
            if len(process_list) >= max_workers:
                time.sleep(POLL_INTERVAL)
        pro = Process(target=download_img, args=(url, i, result_path, ct, access_fail_c))
        logger.info("child_process %s started... ", i)
        pro.start()
        process_list.append(pro)
    for p in process_list:
        p.join()
    logger.info("all child processes finished, failed downloads = %d", access_fail_c.value)


if __name__ == '__main__':
    _configure_logging()
    start_process()
# Tags: dl, img, url, py, list, fail, time, images, import
# From: https://www.cnblogs.com/lfxx/p/17745179.html