Python处理PDF——pdfplumber的安装与使用

发布时间:2023-10-13 15:59:52 作者:守护式等待

Python处理PDF——pdfplumber的安装与使用

# -*- coding:utf-8 -*-

"""
@Time :2023/XX/XX
@Auth :Stone
@File :parse_online_pdf.py
@DESC :在线解析PDF文档
"""
import requests
import pdfplumber
import re, time, os


# Matches "<scheme>://<host>" at the start of the string; anything else is
# treated as a local file path.
_URL_PATTERN = re.compile(
    r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])'''
)

# Headers sent when the caller does not supply any.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)'}

_TEXT_MODES = frozenset({'1', 'text'})
_TABLE_MODES = frozenset({'2', 'table'})
# '0' is kept as an alias for "both": the original extraction loop already
# collected text and tables for it, but then returned nothing.
_BOTH_MODES = frozenset({'0', '3', 'text_and_table'})


def _resolve_mode(mode):
    """Return ``(want_text, want_tables)`` for *mode* or raise ``ValueError``."""
    key = str(mode).lower()
    if key in _TEXT_MODES:
        return True, False
    if key in _TABLE_MODES:
        return False, True
    if key in _BOTH_MODES:
        return True, True
    raise ValueError(f'unsupported mode: {mode!r}')


def _download_pdf(pdf_url, url_params, proxies, target_path):
    """Fetch *pdf_url* with ``requests`` and write the response body to *target_path*."""
    # Work on a copy so the caller's dict is never modified.
    request_kwargs = dict(url_params) if url_params else {}
    if not request_kwargs.get('headers'):
        request_kwargs['headers'] = dict(_DEFAULT_HEADERS)
    # Honour the ``proxies`` argument even when ``url_params`` is supplied,
    # unless ``url_params`` already carries its own proxies.
    if proxies and not request_kwargs.get('proxies'):
        request_kwargs['proxies'] = proxies

    # Kept from the original behaviour: a request carrying 'data' or 'params'
    # is sent as POST, everything else as GET.
    use_post = bool(request_kwargs.get('data') or request_kwargs.get('params'))
    send = requests.post if use_post else requests.get
    response = send(pdf_url, **request_kwargs)
    # Fail here on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()

    with open(target_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    """Extract text and/or tables from a local or online PDF document.

    Args:
        path_or_url: Path of a local PDF file, or a ``scheme://host...`` URL.
            A URL is downloaded to disk first and then parsed.
        mode: What to extract (compared case-insensitively as a string):
            ``1`` / ``'text'`` returns the text as ``str``;
            ``2`` / ``'table'`` returns the tables as ``list``;
            ``3`` / ``'text_and_table'`` (or ``0``) returns a
            ``(text, tables)`` tuple.
        url_params: Optional dict of keyword arguments forwarded to
            ``requests.get`` / ``requests.post`` when downloading. Missing
            ``headers`` are filled with a default User-Agent. The dict is
            not modified.
        proxies: Optional proxies forwarded to ``requests`` when downloading;
            used unless ``url_params`` already contains ``proxies``.
        save_as: When downloading, path where the PDF is kept. Without it the
            download goes to a temporary file that is removed afterwards.

    Returns:
        ``str``, ``list`` or ``tuple`` depending on ``mode``. The text of all
        pages is concatenated without a separator; the tables of all pages
        are collected in one list.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. This is
            checked before any download or parsing happens.
        requests.HTTPError: If the download answers with an error status.
    """
    import tempfile

    want_text, want_tables = _resolve_mode(mode)

    source = os.fspath(path_or_url)
    url_mode = _URL_PATTERN.search(source) is not None

    # Decide where the PDF lives on disk. ``temp_path`` is set only for a
    # throw-away download and marks the file for removal in ``finally``.
    temp_path = None
    if not url_mode:
        pdf_path = source
    elif save_as:
        pdf_path = save_as
    else:
        fd, temp_path = tempfile.mkstemp(suffix='.pdf')
        os.close(fd)
        pdf_path = temp_path
    pdf_path = os.path.abspath(pdf_path)

    text_parts = []
    pdf_tables = []
    try:
        if url_mode:
            _download_pdf(source, url_params, proxies, pdf_path)

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                if want_text:
                    # ``or ''`` keeps a page without text from contributing
                    # the literal string 'None'.
                    text_parts.append(page.extract_text() or '')
                if want_tables:
                    pdf_tables += page.extract_tables()
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # parse result or the original exception.
                pass

    pdf_text = ''.join(text_parts)
    if want_text and want_tables:
        return pdf_text, pdf_tables
    return pdf_text if want_text else pdf_tables


# Deletes newlines and spaces in a single pass.
_STRIP_CHARS = str.maketrans('', '', '\n ')


def replace_str(str_font):
    """Return *str_font* as text with every newline and space removed.

    Args:
        str_font: Value to clean. Non-string values are converted with
            ``str()``; ``None`` is treated as an empty cell.

    Returns:
        The cleaned string, or ``''`` when *str_font* is ``None``.
    """
    if str_font is None:
        # Avoid producing the literal text 'None' for a missing cell.
        return ''
    # Removing all spaces already turns ': ' into ':', so no separate colon
    # replacement is needed.
    return str(str_font).translate(_STRIP_CHARS)


def link_last_list(need_list):
    """Merge continuation rows into the row that precedes them.

    A row whose first cell is the empty string is treated as the continuation
    of the previous result row: the two rows are joined cell by cell as
    strings. Rows of different length are padded with ``''`` so no cell is
    lost and short rows do not raise. A continuation row with no previous
    row, an empty row, and every other row are appended unchanged.

    Args:
        need_list: Iterable of rows, each row a sequence of cells.

    Returns:
        A new list of rows with continuation rows merged. The merged result
        is also printed.
    """
    from itertools import zip_longest

    result_total = []
    for current_list in need_list:
        is_continuation = len(current_list) > 0 and current_list[0] == ''
        if is_continuation and result_total:
            result_total[-1] = [
                str(previous_cell) + str(current_cell)
                for previous_cell, current_cell in zip_longest(
                    result_total[-1], current_list, fillvalue=''
                )
            ]
        else:
            result_total.append(current_list)
    print(f"获取到所有数组合并后为={result_total}")
    return result_total


if __name__ == '__main__':
    # Demo run: fetch the tables of one PDF and stitch split rows together.
    source_url = "********************************"
    tables = online_pdf_parse(source_url, mode='table')

    # Tables are extracted page by page, so one logical row can arrive as two
    # rows; clean every cell first, then merge the continuation rows.
    cleaned_rows = [
        [replace_str(cell) for cell in row]
        for table in tables
        for row in table
    ]
    merged_rows = link_last_list(cleaned_rows)
    print(f"拼接后的数组为={merged_rows}")