(私人lora数据准备)MJ数据转lora训练的处理流程

发布时间 2023-11-20 22:10:01作者: 不上火星不改名

1.【删除乱码并打标签】删前缀(用户名),删后缀(乱码),加统一标签,并打开excel微调。(输入项为1.单个文件夹地址 2.需要文件夹内加上的标签名)

*注意:此时若要加多个标签,请用英文逗号“,”隔开。

 

import os
import openpyxl
import re

# Underscore-separated units that are dropped from every file name.
UNWANTED_UNITS = ["undefined", "皮皮", "zly324"]


# Step 1: strip unwanted parts from the names
def rename_files(path):
    """Propose a cleaned-up name for every regular file directly inside *path*.

    Nothing is renamed on disk; the function only computes the new names.

    * A name that contains a run of 32 lowercase hex characters, or that has
      no underscore at all, is treated as garbage and replaced by a running
      placeholder ``(1)``, ``(2)``, ...
    * Any other name is split on underscores. The first unit is dropped, then
      every unit listed in ``UNWANTED_UNITS`` and every unit containing a
      digit. Finally trailing units that look like an id (32+ hex/hyphen
      characters) or are at most 4 characters long are removed.
    * If that cleaning leaves nothing, the same ``(n)`` placeholder is used
      so that the result is never just a bare extension such as ``.png``.

    Args:
        path: Directory whose files are inspected (not recursive).

    Returns:
        A list with one proposed name per file, original extension kept, in
        the order ``os.listdir`` returned the files.
    """
    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    renamed_files = []

    counter = 1
    for file in files:
        filename, ext = os.path.splitext(file)

        parts = []
        looks_garbled = '_' not in filename or re.search(r'[a-f0-9]{32}', filename)
        if not looks_garbled:
            parts = re.split(r'[_]+', filename)
            parts.pop(0)  # drop the first unit

            # Drop blacklisted units and every unit that contains a digit.
            parts = [
                part for part in parts
                if part not in UNWANTED_UNITS
                and not any(char.isdigit() for char in part)
            ]

            # Tail rules: remove id-like units, then short leftovers.
            while parts and re.search(r'^[a-f0-9\-]{32,}$', parts[-1]):
                parts.pop(-1)
            while parts and len(parts[-1]) <= 4:
                parts.pop(-1)

        if parts:
            renamed = '_'.join(parts)
        else:
            # Garbled name, or nothing survived the cleaning: number it.
            renamed = f"({counter})"
            counter += 1

        renamed_files.append(renamed + ext)

    return renamed_files


# 第二步:增名称
def add_prefix(files, prefix):
    """Put *prefix* in front of every name as its own underscore-separated unit.

    A name counts as already tagged only when its first unit is exactly
    *prefix* (the name starts with ``prefix + "_"``, or the name without its
    extension equals *prefix*). A plain ``startswith(prefix)`` would also
    match longer words, e.g. prefix ``cat`` against ``cats_playing.png``, and
    leave such files untagged.

    After tagging, each name is split on underscores again and units listed
    in ``UNWANTED_UNITS`` are removed; runs of underscores collapse to one.

    Args:
        files: Iterable of file names (with extension).
        prefix: Tag to prepend. An empty prefix leaves the names untagged.

    Returns:
        A new list of names in the same order as *files*.
    """
    prefixed_files = []
    for file in files:
        stem = os.path.splitext(file)[0]
        already_tagged = stem == prefix or file.startswith(f"{prefix}_")
        if prefix and not already_tagged:
            file = f"{prefix}_{file}"

        # Remove blacklisted units from the final name.
        units = [part for part in re.split(r'[_]+', file) if part not in UNWANTED_UNITS]
        prefixed_files.append('_'.join(units))

    return prefixed_files


# 第三步:创建Excel并自动打开
def create_and_open_excel(files, renamed_files, path):
    """Write old/new name pairs to a workbook inside *path* and open it.

    Column A receives the original names, column B the proposed names. The
    workbook is named after the folder and saved inside it.

    Args:
        files: Original file names.
        renamed_files: Proposed names, paired with *files* by position.
        path: Folder the names belong to; the workbook is saved here.

    Returns:
        The path of the saved ``.xlsx`` file.
    """
    import subprocess
    import sys

    wb = openpyxl.Workbook()
    ws = wb.active

    for original, renamed in zip(files, renamed_files):
        ws.append([original, renamed])

    # normpath removes a trailing separator; without it basename() returns ""
    # and the workbook would be saved as a bare ".xlsx".
    folder_name = os.path.basename(os.path.normpath(path))
    excel_path = os.path.join(path, folder_name + ".xlsx")
    wb.save(excel_path)

    # Open the workbook with the platform's default application. No shell is
    # involved, so quotes or other special characters in the path are safe.
    try:
        if sys.platform.startswith("win"):
            os.startfile(excel_path)
        elif sys.platform == "darwin":
            subprocess.run(["open", excel_path], check=False)
        else:
            subprocess.run(["xdg-open", excel_path], check=False)
    except OSError as exc:
        # Opening is a convenience only; the file is already saved.
        print(f"无法自动打开Excel文件,请手动打开: {excel_path} ({exc})")

    return excel_path


# 第五步:读取Excel并重命名文件
def rename_files_from_excel(path, excel_path):
    """Rename files in *path* according to the workbook at *excel_path*.

    Column A of the active sheet holds the current file name and column B the
    desired one. Rows are skipped when either cell is empty, when the source
    file no longer exists, or when the name is unchanged. If the desired name
    is already taken, `` (1)``, `` (2)``, ... is appended before the extension
    until a free name is found.

    Args:
        path: Folder that contains the files.
        excel_path: Workbook written by the earlier step and edited by hand.
    """
    wb = openpyxl.load_workbook(excel_path)
    ws = wb.active

    # Restrict to columns A and B so notes typed into other columns cannot
    # break the two-value unpacking.
    for original_name, new_name in ws.iter_rows(min_col=1, max_col=2, values_only=True):
        if original_name is None or new_name is None:
            continue
        original_name = str(original_name)
        new_name = str(new_name)
        if not new_name:
            continue

        source_path = os.path.join(path, original_name)
        if not os.path.exists(source_path):
            continue

        # An unchanged name would "collide" with itself below and end up
        # renamed to "name (1).ext".
        if new_name == original_name:
            continue

        base_name, ext = os.path.splitext(new_name)
        target_path = os.path.join(path, new_name)
        counter = 1
        while os.path.exists(target_path):
            target_path = os.path.join(path, f"{base_name} ({counter}){ext}")
            counter += 1

        os.rename(source_path, target_path)

    print("重命名完成。")


# 主函数
def main():
    """Run the interactive clean-and-tag workflow for one folder.

    Asks for a folder and a tag, writes the current and proposed names to an
    Excel sheet for manual adjustment, waits for Enter, then applies the
    names found in the sheet.
    """
    # Paths pasted from Windows Explorer ("Copy as path") arrive wrapped in
    # double quotes; remove them together with stray whitespace.
    path = input("请输入文件夹地址: ").strip().strip('"')
    if not os.path.isdir(path):
        print(f"指定的目录 {path} 不存在。")
        return

    # NOTE: rename_files() lists the folder again on its own; the pairing
    # done in create_and_open_excel() relies on both listings having the
    # same order.
    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    renamed_files = rename_files(path)

    prefix = input("请输入需要批量命名的词: ")
    prefixed_files = add_prefix(renamed_files, prefix)

    excel_path = create_and_open_excel(files, prefixed_files, path)
    print(f"Excel文件已保存为:{excel_path}")
    print("请在Excel里微调B列数据,然后保存和关闭Excel文件。完成后按Enter键继续...")

    # Block until the user has finished editing the sheet.
    input()

    # Apply the names from the sheet.
    rename_files_from_excel(path, excel_path)


if __name__ == "__main__":
    main()

2.【处理下划线】统一只保留第一个下划线,删除后面的下划线(输入项为:总地址)

import os
import shutil

def copy_directory(src, dst):
    """Copy the directory tree at *src* to *dst*.

    If *dst* already exists nothing is copied and a notice is printed.
    """
    try:
        shutil.copytree(src, dst)
    except FileExistsError:
        notice = f"备份目录 '{dst}' 已存在。"
        print(notice)
    return None

def rename_image_files(directory):
    """Keep only the first underscore in image file names under *directory*.

    Walks *directory* recursively. For every ``.png``/``.jpg``/``.jpeg``/
    ``.gif`` file (case-insensitive) the first underscore is kept and every
    later underscore becomes a space. If the resulting name is already taken,
    `` (1)``, `` (2)``, ... is appended before the extension until a free
    name is found. Each rename is reported on stdout.

    Args:
        directory: Root of the tree to process.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith(image_suffixes):
                continue

            # partition() splits at the first underscore only; with no
            # underscore, sep and tail are empty and the name is unchanged.
            head, sep, tail = file.partition('_')
            new_file_name = head + sep + tail.replace('_', ' ')
            if new_file_name == file:
                continue

            original_file_path = os.path.join(root, file)
            new_file_path = os.path.join(root, new_file_name)

            # Split once, outside the loop, so that every candidate is built
            # from the clean stem ("a (2).png") rather than from the previous
            # candidate ("a (1) (2).png").
            stem, file_extension = os.path.splitext(new_file_name)
            increment = 1
            while os.path.exists(new_file_path):
                new_file_path = os.path.join(root, f"{stem} ({increment}){file_extension}")
                increment += 1

            os.rename(original_file_path, new_file_path)
            print(f"文件 {original_file_path} 已重命名为 {new_file_path}")

def main():
    """Ask for a directory, copy it to a ``_backup`` subfolder and rename the images in that copy."""
    target = input("请输入要处理的目录路径: ")

    if os.path.exists(target):
        # The copy lives inside the chosen directory.
        working_copy = os.path.join(target, "_backup")
        print(f"正在创建备份目录: {working_copy}")
        copy_directory(target, working_copy)

        # Only the files in the copy are renamed.
        print("正在重命名备份目录中的图片文件...")
        rename_image_files(working_copy)
        print("重命名操作完成。")
    else:
        print(f"指定的目录 {target} 不存在。")

if __name__ == "__main__":
    main()

3.【逗号转下划线】大地址下图片文件名所有逗号转换为下划线,适用于输入多条标签情况。(输入项为大地址)

import os

def replace_commas_in_filenames_interactive():
    """Replace commas in image file names with underscores.

    Prompts for a directory, walks it recursively and renames every image
    file (extension matched case-insensitively) whose name contains an ASCII
    comma or a full-width comma (U+FF0C). If the new name is already taken,
    `` (1)``, `` (2)``, ... is appended before the extension so that no
    existing file is overwritten.

    Returns:
        A list with the new full path of every file that was renamed.
    """
    path = input("请输入目录路径: ")
    modified_files = []
    # Supported image extensions, compared against the lower-cased name.
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')

    # The second replace() in the earlier version had an empty search string,
    # which inserts "_" between every character. A full-width comma is the
    # presumed intent; it is written as an escape so it cannot be lost again.
    comma_to_underscore = str.maketrans({',': '_', '\uff0c': '_'})

    for root, _dirs, files in os.walk(path):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            new_file = file.translate(comma_to_underscore)
            if new_file == file:
                continue

            stem, ext = os.path.splitext(new_file)
            target = os.path.join(root, new_file)
            suffix = 1
            while os.path.exists(target):
                target = os.path.join(root, f"{stem} ({suffix}){ext}")
                suffix += 1

            os.rename(os.path.join(root, file), target)
            modified_files.append(target)

    return modified_files

# Run the interactive routine and list every file it renamed.
modified_files = replace_commas_in_filenames_interactive()
for f in modified_files:
    print(f"Modified file: {f}")

4.步骤1-3参照MJ转大模型训练,不同点在于,接下来需要的是:生成左右对称的镜像(防止图片不够)

import os
from PIL import Image

def flip_and_duplicate_image(image_path, output_path):
    """
    Save a left-right mirrored copy of the image at image_path to output_path.

    Returns True on success. Any exception is reported on stdout and turned
    into a False return value.
    """
    try:
        with Image.open(image_path) as source:
            mirrored = source.transpose(Image.FLIP_LEFT_RIGHT)
            mirrored.save(output_path)
    except Exception as err:
        print(f"无法处理图像 {image_path}: {err}")
        return False
    else:
        return True

def process_images_in_directory(directory):
    """
    Create a mirrored "(1)" copy next to every image under directory.

    The tree is walked recursively. Files ending in .png/.jpg/.jpeg/.gif/.bmp
    (case-insensitive) are mirrored into "<name>(1)<ext>" unless that file
    already exists. A progress line is printed after each image.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # Collect all images first so the total is known before work starts.
    pending = []
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if name.lower().endswith(image_suffixes):
                pending.append((folder, name))

    total_files = len(pending)
    processed_files = 0

    for folder, name in pending:
        stem, suffix = os.path.splitext(name)
        mirrored_path = os.path.join(folder, f"{stem}(1){suffix}")

        # Existing files are never overwritten; only successful flips count.
        if not os.path.exists(mirrored_path):
            if flip_and_duplicate_image(os.path.join(folder, name), mirrored_path):
                processed_files += 1

        percent = (processed_files / total_files) * 100
        print(f"处理进度: {processed_files}/{total_files} ({percent:.2f}%)")

def main():
    """Ask for a folder and mirror every image in it, or report an invalid path."""
    directory = input("请输入要处理的文件夹的路径: ")
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(directory):
        print("提供的路径无效或不是一个目录。")
        return
    process_images_in_directory(directory)
    print("图片处理完成。")

# Run the script: main() is called unconditionally when this code is executed.
main()

5.根据文件名生成文本,并在文本内去掉多余元素,只保留描述语

import os
import re

def create_txt_from_image():
    """Write a caption ``.txt`` file next to every image, derived from its name.

    Prompts for a root folder and walks it recursively. For each ``.jpg``/
    ``.png``/``.jpeg`` file (extension matched case-insensitively) a text
    file with the same base name is written. Its content is the base name
    with every ``(<digits>)`` group removed, underscores turned into commas
    and surrounding whitespace stripped.

    Prints a message and returns early if the folder does not exist.
    """
    # Ask the user for the folder.
    root_folder = input("请输入图片所在文件夹的完整路径:")

    if not os.path.exists(root_folder):
        print("路径不存在,请检查输入的地址。")
        return

    # Matches a parenthesised number such as the "(1)" added to copies.
    bracket_pattern = re.compile(r'\(\d+\)')
    image_suffixes = ('.jpg', '.png', '.jpeg')

    for folder_path, _dirs, files in os.walk(root_folder):
        for file in files:
            if not file.lower().endswith(image_suffixes):
                continue

            base_name = os.path.splitext(file)[0]

            # Build the final caption in one go instead of writing the raw
            # name and rewriting the file afterwards. strip() removes the
            # blank left behind by a " (1)" suffix.
            caption = bracket_pattern.sub('', base_name).replace("_", ",").strip()

            txt_path = os.path.join(folder_path, base_name + '.txt')
            with open(txt_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(caption)

    print("所有图片对应的txt文件已创建完毕。")
    print("所有txt文件内容中的括号及括号内的数字已删除,并且所有下划线已转换为英文逗号。")

# Run the function: it is called unconditionally when this code is executed.
create_txt_from_image()