Python 数据清洗：日期格式和 IPv4 地址格式

发布时间：2023-10-19 12:08:51　作者：YE-

清洗日期格式

import re
from datetime import datetime

# Timestamps of the form "19/Oct/2023:12:08:51 +0800". The sign of the UTC
# offset may be "+" or "-" so that zones west of UTC are converted as well.
pattern = r'(\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})'


def _reformat_timestamp(found):
    """Return the matched timestamp rewritten as 'YYYY/MM/DD HH:MM'.

    The UTC offset is parsed but not emitted, and the seconds are dropped.
    Text that fits the pattern but is not a real date (for example an unknown
    month abbreviation or day 99) is returned unchanged instead of aborting
    the whole run with ValueError.

    NOTE: '%b' is locale-dependent; English month abbreviations are only
    recognised under an English/C locale.
    """
    raw = found.group(0)
    try:
        parsed = datetime.strptime(raw, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        return raw
    return parsed.strftime('%Y/%m/%d %H:%M')


# Read the raw input.
with open('result.txt', 'r') as file:
    data = file.read()

# Rewrite every timestamp in a single pass over the text. This avoids one
# full-text str.replace() scan per match, which is quadratic on large logs.
data = re.sub(pattern, _reformat_timestamp, data)

# Write the cleaned text to the output file.
with open('out.txt', 'w') as file:
    file.write(data)

清洗 IPv4 地址格式

import re
import requests
import concurrent.futures

# Regular expression for dotted-quad IPv4 addresses.
# Each octet is limited to 0-255 (leading zeros are still accepted), so
# strings such as "999.1.1.1" are no longer treated as addresses.
_IPV4_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
ipv4_pattern = re.compile(
    r'\b(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'\b'
)

# Query the ip-api.com JSON endpoint and return the city for an IP address.
def get_city_info(ip, timeout=5):
    """Return the city name reported for *ip*, or *ip* itself on any failure.

    Args:
        ip: IPv4 address string to look up.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block a worker
            thread indefinitely.

    Returns:
        The 'city' field of the response when its 'status' is 'success' and
        the city is non-empty; otherwise the original *ip*, so the caller
        leaves the address untouched.
    """
    try:
        response = requests.get(f'http://ip-api.com/json/{ip}', timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network error, timeout, or a body that is not valid JSON:
        # fall back to the address, like the non-success case below.
        return ip

    if not isinstance(data, dict) or data.get('status') != 'success':
        return ip
    # An empty or missing city would erase the address from the text, so keep
    # the IP in that case.
    return data.get('city') or ip

# Read the text produced by the date-cleaning step.
with open('out.txt', 'r') as file:
    content = file.read()

# Collect each distinct IPv4 address once, in first-seen order, so repeated
# addresses cost a single API request instead of one per occurrence.
ip_addresses = list(dict.fromkeys(ipv4_pattern.findall(content)))

# Look the addresses up in parallel; the work is network-bound.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    city_by_ip = dict(zip(ip_addresses, executor.map(get_city_info, ip_addresses)))

# Replace whole regex matches in one pass. A chain of str.replace() calls
# would also rewrite a short address where it appears inside a longer one
# (e.g. "1.2.3.4" inside "11.2.3.40") and corrupt it.
content = ipv4_pattern.sub(
    lambda found: city_by_ip.get(found.group(0), found.group(0)),
    content,
)

# Write the result to the output file.
with open('output.txt', 'w') as file:
    file.write(content)