PDF转化为txt文件

发布时间: 2023-04-12 18:55:52 作者: 乔小生1221

环境

python>=3.6

pip install pdfminer.six

直接贴代码:

#!/usr/bin/env python3.8.6
# _*_ coding: utf-8 _*_
# Description:
# Author: qiaoxiaohang <qiaoxiaohang@beyondsoft.com>
# Date: 2023/4/12 18:20
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage



# 获取pdf文档
def fun_pdf(url,name):
    """Extract the text of one PDF and append it to a text file.

    Every layout element that exposes ``get_text`` is echoed to stdout and
    appended, in page order, to the output file.

    Args:
        url: Path of the PDF file to read.
        name: Path of the text file to append to (UTF-8). It is opened once
            per call, so it is created even if the PDF yields no text.
    """
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even if parsing fails part-way through the document.
    with open(url, 'rb') as fp:
        # Parser bound to the open PDF stream.
        parser = PDFParser(fp)

        # Document object, linked to the parser in both directions.
        doc = PDFDocument(parser=parser)
        parser.set_document(doc=doc)

        # For an encrypted PDF, a password would have to be supplied here.
        # doc._initialize_password()

        # Resource manager shared by the device and the interpreter.
        resource = PDFResourceManager()

        # Layout-analysis parameters (library defaults).
        laparam = LAParams()

        # Aggregator device: collects the layout of each processed page.
        device = PDFPageAggregator(resource, laparams=laparam)

        # Page interpreter that feeds page content into the device.
        interpreter = PDFPageInterpreter(resource, device)

        # Open the output once instead of once per text element.
        with open(name, 'a', encoding='utf-8') as fw:
            # NOTE(review): get_pages is given the raw stream rather than
            # `doc`; kept as in the original to preserve behavior.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

                # Layout of the page that was just interpreted.
                layout = device.get_result()
                for out in layout:
                    # Only some layout elements carry text.
                    if hasattr(out, 'get_text'):
                        text = out.get_text()
                        print(text)
                        fw.write(text)


if __name__ == '__main__':
    import os

    # Raw string: same value as before, but without the invalid escape
    # sequences (\A, \D, \p) that newer Python versions warn about.
    dir_path = r'C:\Users\Administrator\Desktop\test\pdf'

    for i in os.listdir(dir_path):
        # splitext keeps dots inside the base name ("a.b.pdf" -> "a.b"), so
        # different PDFs no longer collapse onto the same output file.
        stem, ext = os.path.splitext(i)

        # Skip anything that is not a PDF instead of handing it to the parser.
        if ext.lower() != '.pdf':
            continue

        # The text file is written to the current working directory.
        name_txt = stem + '.txt'
        url = os.path.join(dir_path, i)
        fun_pdf(url, name_txt)