mht文件图片提取 python

发布时间 2023-10-26 14:23:05作者: 油画天空

简介

mhtml文件又称为聚合html文档、web档案或单一文件网页。单个文件网页可将网站的所有元素(包括文本和图形)都保存到单个文件中。总的来说mht文件保存了一个网页内的所有元素,让用户可以在没有网络的情况下访问网页。

本程序提取mht文件中的图片并保存至新建文件夹,同时将该文件夹打包为zip压缩文件。使用方法:

  • 将代码复制存为python文件
  • mht文件放在代码同级目录下
  • 双击运行代码

随后即可在同级目录下生成图片文件夹和对应的zip压缩文件。

实现效果

以下是保存的网页,是一章漫画。

运行程序截图:

>>>>>>> python -u "d:\CODE\extract_test\mht_img_extract.py"
[Find File] 第52话 - 想要成为影之实力者.mht
[File Nums] 1

------------------------------------------PROCESS [1/1]-----------------------------------
[File Name] 第52话 - 想要成为影之实力者.mht
[File Path] D:\CODE\extract_test
[New Folder] 第52话 - 想要成为影之实力者
[Saved Img] 36
[Raw/Compressed] 9.50 / 8.78 MB

文件保存截图:

实现思路

下面是截取mht文件部分内容:

------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----
Content-Type: image/jpeg
Content-Transfer-Encoding: base64
Content-Location: https://s1-a3-ussv.baozicdn.com/scomic/xiangyaochengweiyingzhishilizhe-fengzedajiebanyexingli/0/52-lros/1.jpg  

/9j/4SSgRXhpZgAATU0AKgAAAAgADAEAAAMAAAABBFoAAAEBAAMAAAABBkAAAAECAAMAAAADAAAA
ngEGAAMAAAABAAIAAAESAAMAAAABAAEAAAEVAAMAAAABAAMAAAEaAAUAAAABAAAApAEbAAUAAAAB
AAAArAEoAAMAAAABAAIAAAExAAIAAAAhAAAAtAEyAAIAAAAUAAAA1YdpAAQAAAABAAAA7AAAASQA
CAAIAAgACvyAAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIDIyLjMgKE1hY2ludG9zaCkAMjAy
MzowNDozMCAwOTo1Njo1OQAAAAAABJAAAAcAAAAEMDIzMaABAAMAAAABAAEAAKACAAQAAAABAAAE
.................源文件base64编码太长,这里截取部分作为演示....................
+04k3NZyLMy+w+/+O38K9CgmjngSaJ90cihlb2PSuIkgjnieKVEaORdrKycMD/T+VT/De/kfQ7jR
bn/j50aZrX5uvldYj/3xigDt6KKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKK
ACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooA
in/495P9014r+z397xV/19r/AOzV7VP/AMe8n+6a8V/Z7+94q/6+1/8AZqAPb6KKKACiiigAoooo
AKKKKACkpaSgD//Z

------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----
Content-Type: image/jpeg
Content-Transfer-Encoding: base64
Content-Location: https://s1-a3-ussv.baozicdn.com/scomic/xiangyaochengweiyingzhishilizhe-fengzedajiebanyexingli/0/52-lros/6.jpg

从这里我们可以看出这是一个图片,使用base64编码。接下来我们就需要遍历mht文件,寻找形如------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----的分隔行(每个mht文件的分隔串各不相同,因此程序只判断该行是否以-开头并且包含Boundary)。找到这一行,紧随其后就是一个图片文件的说明部分,据此我们可以获取图片的基本信息。

  • Content-Type 说明了图片文件类型
  • Content-Transfer-Encoding 说明了文件编码方式
  • Content-Location 说明了文件的网址

在base64编码的末尾有一个空行,读取到空行即说明已经读完了一张图片的全部编码,接下来只需使用python的base64.b64decode()还原图片并将其保存。当然其中还有许多细节问题,比如图片名获取、图片大小限制、空行判断问题等等,在以下代码中均已实现。

实现代码

import base64
import os
import zipfile

# 1: reuse an already existing output folder without asking;
# 0: ask on the console before reusing it.
FILE_EXIST_AUTO_REWRITE = 1 
# 1: name saved images sequentially (0001, 0002, ...);
# 0: keep the base name taken from the part's Content-Location header.
IMG_NAME_REWRITE = 1
# Images whose base64 payload has this many lines or fewer are not saved.
BASE64_LINE_THRESHOLD = 500


def saveImg(file_path, folder_path):
    """Extract the base64-encoded images of an MHT file into a folder.

    The file is scanned line by line for part boundaries (a line that starts
    with '-' and contains 'Boundary').  The three lines following a boundary
    are expected to be, in this order, the Content-Type,
    Content-Transfer-Encoding and Content-Location headers.  Parts that are
    base64-encoded images are decoded and written into *folder_path*; every
    other part is skipped.

    The module-level constants BASE64_LINE_THRESHOLD (images with that many
    base64 lines or fewer are ignored) and IMG_NAME_REWRITE (1 = sequential
    zero-padded names, otherwise the name from Content-Location) control
    which images are saved and how they are named.

    Args:
        file_path: Path of the MHT file to read.
        folder_path: Existing directory that receives the image files.

    Returns:
        None.  The number of saved images is printed.
    """
    img_dic = dict()      # key: saved image name, value: base64 line count
    skipped_dic = dict()  # key: part name, value: its transfer encoding
    sifted_dic = dict()   # key: part name, value: base64 line count (too small)

    def parse_header(raw):
        # "Content-Type: image/jpeg" -> ['Content-Type', 'image', 'jpeg']
        return raw.replace(':', '').replace('/', ' ').split()

    with open(file_path, 'rb') as file:

        def read_line():
            # Header lines may contain bytes that are not valid UTF-8 (other
            # charsets, 8bit/binary parts); replacing them keeps the scan
            # alive instead of aborting with UnicodeDecodeError.
            return file.readline().decode('utf-8', errors='replace')

        while True:
            line = read_line()
            if line == '':
                break  # end of file
            if not (line.startswith('-') and 'Boundary' in line):
                continue

            content_type = parse_header(read_line())      # Content-Type
            content_encode = parse_header(read_line())    # Content-Transfer-Encoding
            content_location = parse_header(read_line())  # Content-Location
            if not (content_type and content_encode and content_location):
                continue
            file_type = content_type[-1]
            file_encode = content_encode[-1]
            file_name = content_location[-1].split('?')[0]  # drop the URL query

            # The length check guards against a header line with one token.
            is_image = len(content_type) > 1 and content_type[1] == 'image'
            if not (is_image and file_encode == 'base64'):
                skipped_dic[file_name] = file_encode
                continue

            file.readline()  # blank line between the headers and the payload
            # The payload stays as bytes: b64decode accepts bytes and discards
            # characters outside the base64 alphabet, so no decoding is needed.
            chunks = [file.readline().rstrip(b'\r\n')]
            lines = 0
            while True:
                raw = file.readline()
                lines += 1
                chunks.append(raw.rstrip(b'\r\n'))
                if not raw.strip():  # blank line (or EOF) ends the payload
                    break

            if lines <= BASE64_LINE_THRESHOLD:
                # Too small to be of interest.
                sifted_dic[file_name] = lines
                continue

            if IMG_NAME_REWRITE == 1:
                img_name = str(len(img_dic) + 1).zfill(4) + '.' + file_type
            else:
                img_name = file_name.split('.')[0] + '.' + file_type
            img_dic[img_name] = lines
            image_bytes = base64.b64decode(b''.join(chunks))
            with open(os.path.join(folder_path, img_name), 'wb') as img:
                img.write(image_bytes)

    # skipped_dic and sifted_dic are kept only as debugging aids; print them
    # here to inspect which parts were ignored.
    print('[Saved Img] %d' % (len(img_dic)))


def getDirSize(dir):
    """Return the total size in bytes of all files below *dir*.

    The directory tree is walked recursively and the sizes of the files
    found in every directory are added up.
    """
    total_bytes = 0
    for parent, _subdirs, filenames in os.walk(dir):
        for filename in filenames:
            total_bytes += os.path.getsize(os.path.join(parent, filename))
    return total_bytes


def zipImg(folder_path, folder_name):
    """Pack the entries of *folder_path* into '<folder_name>.zip'.

    The archive is created relative to the current working directory and
    stores each directory entry under its bare name.  After writing, the
    size of the folder and of the archive are printed in MB.

    Args:
        folder_path: Directory whose entries are added to the archive.
        folder_name: Base name of the archive; '.zip' is appended.
    """
    zip_name = folder_name + '.zip'
    # The context manager closes the archive even if adding an entry fails.
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for file in os.listdir(folder_path):
            archive.write(os.path.join(folder_path, file), file)
    dir_size = getDirSize(folder_path)
    zip_size = os.path.getsize(zip_name)
    print(r'[Raw/Compressed] %.2f / %.2f MB'%(dir_size/1024/1024, zip_size/1024/1024))


def extractFile(FILE_NAME):
    """Extract and zip the images of one MHT file in the working directory.

    The output folder is named after the file without its last extension
    and is created next to it.  If the folder already exists and
    FILE_EXIST_AUTO_REWRITE is 0, the user is asked for confirmation; only
    an explicit yes ('y' or 'yes', case-insensitive) continues.

    Args:
        FILE_NAME: Name of the MHT file, relative to the current working
            directory.

    Returns:
        None.  Progress is reported on standard output.
    """
    CUR_PATH = os.getcwd()
    cur_file = os.path.join(CUR_PATH, FILE_NAME)
    if not os.path.isfile(cur_file):
        print("No This File!")
        return

    print("[File Name]", FILE_NAME)
    print("[File Path]", CUR_PATH)
    folder_name = '.'.join(FILE_NAME.split('.')[0:-1])
    folder_path = os.path.join(CUR_PATH, folder_name)
    if os.path.exists(folder_path):
        if FILE_EXIST_AUTO_REWRITE == 0:
            print("Folder Exist:", folder_name, "Rewrite It? [y/n]", end=" ")
            confirm = input().strip().lower()
            # Anything but an explicit yes aborts, so a typo or a bare Enter
            # never overwrites existing output.
            if confirm not in ('y', 'yes'):
                return
    else:
        os.makedirs(folder_path)
        print("[New Folder]", folder_name)
    saveImg(cur_file, folder_path)
    zipImg(folder_path, folder_name)


def getTargetFile():
    """Return the names of the .mht files in the current working directory.

    Only regular files whose extension is '.mht' (compared
    case-insensitively) are reported; directories and files that merely
    happen to be called 'mht' are ignored.  Every match and the total count
    are printed.

    Returns:
        A list of file names (without directory part).
    """
    cwd = os.getcwd()
    target_file_list = []
    for file in os.listdir(cwd):
        extension = os.path.splitext(file)[1]
        if extension.lower() == '.mht' and os.path.isfile(os.path.join(cwd, file)):
            target_file_list.append(file)
    for file in target_file_list:
        print('[Find File]', file)
    print('[File Nums]', len(target_file_list))
    return target_file_list


if __name__ == '__main__':
    # Process every MHT file found in the working directory, one after the
    # other, announcing each with a numbered banner.
    mht_files = getTargetFile()
    total = len(mht_files)
    for position, mht_name in enumerate(mht_files, start=1):
        banner = "\n------------------------------------------PROCESS [{}/{}]-----------------------------------"
        print(banner.format(position, total))
        extractFile(mht_name)