dl_images_4.py

发布时间 2023-10-06 22:13:09 作者: 苦逼yw

 

 

#!/usr/bin/env python3
import os
import sys
import pandas as pd
import requests
from requests.packages.urllib3.util import Retry
from requests.adapters import HTTPAdapter
from requests import Session
import time
import logging
from logging.handlers import RotatingFileHandler
import re
from clickhouse_driver import Client
from multiprocessing import Process
from multiprocessing import cpu_count
import multiprocessing

'''
Read image URLs from CSV files, request each URL (retrying on failure),
and download the images in bulk.
'''


def get_xg_images_url():
    """Load the Hong Kong plate (xg) records from ``./xg_fail_rec.csv``.

    The path is relative to the current working directory.

    Returns:
        A tuple ``(image_url1, license_plate2, capture_time)`` holding the
        three CSV columns of those names as pandas Series.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If one of the three columns is missing.
    """
    df = pd.read_csv('./xg_fail_rec.csv')
    return df['image_url1'], df['license_plate2'], df['capture_time']
def get_am_images_url():
    """Load the Macau plate (am) records from ``./am_fail_rec.csv``.

    The path is relative to the current working directory.

    Returns:
        A tuple ``(image_url1, license_plate2, capture_time)`` holding the
        three CSV columns of those names as pandas Series.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If one of the three columns is missing.
    """
    df = pd.read_csv('./am_fail_rec.csv')
    return df['image_url1'], df['license_plate2'], df['capture_time']
def download_img(img_url, num, result_path, plateNo, ct, access_fail_c,
                 timeout=30):
    """Download one image and save it as ``<plateNo>_<ct>+<num>.jpg``.

    Written to run as the target of a ``multiprocessing.Process``: when the
    HTTP request itself fails, the function ends the process with exit
    status 1.

    Args:
        img_url: URL of the image to fetch.
        num: Sequence number of this task; used in the file name and in the
            log messages.
        result_path: Directory the image is written to. It is created if it
            does not exist yet.
        plateNo: License plate value used as the file-name prefix.
        ct: Capture time value used in the file name.
        access_fail_c: Shared ``multiprocessing.Value`` that counts failed
            downloads; it is incremented on every failure path.
        timeout: Timeout in seconds passed to the HTTP request, so that a
            stalled server cannot block the worker forever.

    Raises:
        SystemExit: With code 1 when the request raises an exception.
    """
    # Use the root logger directly instead of a module global that only
    # exists in the ``__main__`` process (it is missing in spawned children).
    log = logging.getLogger()

    def _count_failure():
        # ``value + 1`` is a read-modify-write; hold the Value's lock so that
        # concurrent workers cannot lose each other's increments.
        with access_fail_c.get_lock():
            access_fail_c.value += 1

    # ``format`` also copes with non-string CSV values (e.g. NaN or numbers),
    # where ``+`` concatenation would raise TypeError.
    img_name = '{}_{}+{}.jpg'.format(plateNo, ct, num)
    # Build an explicit path rather than changing the working directory.
    img_path = os.path.join(result_path, img_name)

    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            # Apply the retry policy to both schemes, not only plain HTTP.
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            img_obj = s.get(img_url, timeout=timeout)
    except Exception:
        # Process-level boundary: record any request failure, but let
        # KeyboardInterrupt/SystemExit propagate untouched.
        _count_failure()
        log.error("connect fail {}  ".format(img_url))
        log.info("child_process {} exited... ".format(num))
        sys.exit(1)

    if int(img_obj.status_code) != 200:
        _count_failure()
        log.error("download {} fail staus = {}".format(img_url, img_obj.status_code))
    else:
        try:
            os.makedirs(result_path, exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
        except OSError:
            _count_failure()
            log.error("saved fail {} fail staus = {}".format(img_url, img_obj.status_code))
        else:
            log.info("saved success {}  staus = {}".format(img_url, img_obj.status_code))
    log.info("child_process {} exited... ".format(num))
def _download_batch(image_url, license_plate, cts, result_path, access_fail_c):
    """Run ``download_img`` once per row, at most ``cpu_count()`` at a time.

    Args:
        image_url: Iterable of image URLs.
        license_plate: Iterable of plate values, aligned with ``image_url``.
        cts: Iterable of capture-time values, aligned with ``image_url``.
        result_path: Directory the images are saved to; created if missing.
        access_fail_c: Shared ``multiprocessing.Value`` failure counter that
            is handed to every worker.
    """
    from multiprocessing.connection import wait

    log = logging.getLogger()
    os.makedirs(result_path, exist_ok=True)
    max_workers = cpu_count()
    running = []
    # zip() pairs the columns by position, so this does not depend on the
    # Series carrying a default 0..n-1 index.
    for i, (url, plate, ct) in enumerate(zip(image_url, license_plate, cts)):
        if len(running) >= max_workers:
            # Block on the process sentinels until a worker exits instead of
            # spinning in a busy loop.
            done = set(wait([p.sentinel for p in running]))
            for p in running:
                if p.sentinel in done:
                    p.join()
            # Build a new list rather than removing items while iterating.
            running = [p for p in running if p.sentinel not in done]
        worker = Process(target=download_img,
                         args=(url, i, result_path, plate, ct, access_fail_c))
        log.info("child_process {} started... ".format(i))
        worker.start()
        running.append(worker)
    for p in running:
        p.join()


def start_process():
    """Download the Hong Kong (xg) and then the Macau (am) plate images.

    URLs are read via ``get_xg_images_url`` / ``get_am_images_url`` and the
    images are stored under ``fail_images/fail_xg/`` and
    ``fail_images/fail_am/`` next to this script. The number of URLs in each
    set and the total number of failed downloads are printed to stdout.
    """
    # Integer counter shared with the workers (a count is not a float).
    access_fail_c = multiprocessing.Value('i', 0)
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # Download the Hong Kong plate images.
    image_url, license_plate, cts = get_xg_images_url()
    print('港牌url总数: {}'.format(len(image_url)))
    _download_batch(image_url, license_plate, cts,
                    base_dir + "/fail_images/fail_xg/", access_fail_c)

    # Download the Macau plate images.
    image_url, license_plate, cts = get_am_images_url()
    print('澳牌url总数: {}'.format(len(image_url)))
    _download_batch(image_url, license_plate, cts,
                    base_dir + "/fail_images/fail_am/", access_fail_c)

    print('访问图片失败总数:{}'.format(access_fail_c.value))
if __name__ == '__main__':
    # Send INFO-and-above records from the root logger to dl_images.log,
    # located next to this script, then start the downloads.
    log_file = os.path.dirname(os.path.abspath(__file__)) + "/dl_images.log"
    handler = RotatingFileHandler(log_file)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

    start_process()