python分段读取word文件数据到MySQL数据库和Java读取word数据到MySQL数据库

发布时间:2023-04-14 21:27:49 作者:往心。

1、python分段读取word文件数据到MySQL数据库

  示例:(注:此示例读取某个文件夹下的所有文件,只处理后缀名为 doc 的文件,并以文件名称(不含后缀)作为匹配条件,把该 word 文件的内容更新到数据表中对应的记录;若要改为插入,可自行更改 sql 语句。)

# 导入os模块
import os
#导入所需库
import pymysql
from docx import Document

# Directory whose entries are scanned for Word documents.
path = "your path"
# os.listdir() returns the names of the entries in that directory as a list.
file_name_list = os.listdir(path)
# Alias kept from the original script; this is the same list, not a string.
file_name = file_name_list
for fileitem in file_name:
    # splitext() splits on the last dot only, so names containing several dots
    # (or none at all) no longer break the title/extension detection.
    file_title, extension = os.path.splitext(fileitem)
    extension = extension[1:]  # drop the leading "."
    print(file_title, end=">>")
    print(extension)
    if extension != 'doc':
        continue

    # Step 1: read the document and flatten every run into one string.
    # NOTE(review): python-docx is documented for .docx (OOXML) files; confirm
    # that the ".doc" files in this folder can actually be opened by Document().
    try:
        document = Document(os.path.join(path, fileitem))
        # Every run that is not a single blank is kept, followed by the same
        # "</br>" separator the original script appended.
        strtext = "".join(
            run.text + "</br>"
            for paragraph in document.paragraphs
            for run in paragraph.runs
            if run.text != ' '
        )
    except Exception as efile:
        print("file reading failed", efile)
        continue

    # Step 2: store the text in the row whose file_title matches the file name.
    try:
        db = pymysql.connect(host='localhost', port=3306, user='root', passwd='yourpassword', db='your数据库',
                             charset='utf8')
    except Exception as es:
        print("files update failed!!", es)
        continue
    try:
        cursor = db.cursor()
        try:
            # Parameterized query: the driver escapes both values, so quotes in
            # the document text can neither break nor inject into the statement.
            sql = "update lawfiles_information_context1 set file_context = %s where file_title = %s"
            cursor.execute(sql, (strtext, file_title))
            db.commit()
        finally:
            cursor.close()
    except Exception as es:
        # Undo the partial transaction and report instead of failing silently.
        db.rollback()
        print("files update failed!!", es)
    finally:
        # Always release the connection, on success as well as on failure.
        db.close()

2、Java读取word数据到MySQL数据库

示例(注:以下Java代码同上述Python代码功能相同,均为读取某个文件夹下所有word文件内容并进行逐段读取同时存储至数据库,此处仅为更新数据表某个字段的内容,若要进行插入,可自行更改sql语句)

package testJava;

import com.spire.doc.Document;
import com.spire.doc.Section;
import com.spire.doc.documents.Paragraph;
import util.DbHelper;

import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Reads the text of every ".doc" file in a folder and stores it in the
 * {@code file_context} column of the row whose {@code file_title} equals the
 * file name without its extension.
 *
 * Original author notes: 5341 / 12099 (meaning not stated in the source).
 *
 * @author June
 * @date 2023/4/9 13:42
 */
public class Addcontext {
    /** Folder scanned by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_DIR =
            "D:\\GraduationProject\\program\\coding\\paqu\\laws_regulations\\1crawling\\laws_files3";

    /** Upper bound on the number of directory entries processed in one run. */
    private static final int MAX_FILES = 2000;

    /**
     * Runs a two-parameter UPDATE (context first, title second) and reports
     * whether at least one row was changed.
     *
     * @param sql          UPDATE statement with two {@code ?} placeholders
     * @param file_context value bound to the first placeholder
     * @param file_title   value bound to the second placeholder
     * @return {@code true} if at least one row was updated, {@code false}
     *         otherwise or when an SQLException occurred
     */
    private static boolean runUpdate(String sql, String file_context, String file_title) {
        // NOTE(review): the connection is intentionally not closed here; the
        // source does not show whether DbHelper hands out a shared connection.
        // Confirm ownership before adding a close().
        Connection conn = DbHelper.getConnection();
        PreparedStatement pst = null;
        try {
            pst = conn.prepareStatement(sql);
            pst.setString(1, file_context);
            pst.setString(2, file_title);
            return pst.executeUpdate() > 0;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            // Release the statement on every path, not only after success.
            if (pst != null) {
                try {
                    pst.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Updates {@code file_context} in table lawfiles_information_deal_context.
     *
     * @param file_context text to store
     * @param file_title   title identifying the row
     * @return {@code true} if at least one row was updated
     */
    public static boolean fileupdateInfo(String file_context,String file_title) {
        return runUpdate(
                "update lawfiles_information_deal_context set file_context = ? where file_title = ?",
                file_context, file_title);
    }

    /**
     * Runs the same update as {@link #fileupdateInfo(String, String)} against
     * table lawfiles_information_deal_context.
     *
     * @param file_context text to store
     * @param file_title   title identifying the row
     * @return {@code true} if at least one row was updated
     */
    public static boolean judgenuLL(String file_context,String file_title) {
        return runUpdate(
                "update lawfiles_information_deal_context set file_context = ? where file_title = ?",
                file_context, file_title);
    }

    /**
     * Updates {@code file_context} in table lawfiles_information_place.
     *
     * @param file_context text to store
     * @param file_title   title identifying the row
     * @return {@code true} if at least one row was updated
     */
    public static boolean fileupdatepart(String file_context,String file_title) {
        return runUpdate(
                "update lawfiles_information_place set file_context = ? where file_title = ?",
                file_context, file_title);
    }

    /**
     * Walks the folder, reads each ".doc" file and stores its text through
     * {@link #fileupdatepart(String, String)}.
     *
     * @param args optional; {@code args[0]} overrides the default folder
     */
    public static void main(String[] args) {
        System.out.println("文件有如下:");
        File file = new File(args.length > 0 ? args[0] : DEFAULT_DIR);
        File[] files = file.listFiles();
        // listFiles() returns null when the path is not a readable directory.
        if (files == null) {
            System.err.println("Cannot list directory: " + file.getPath());
            return;
        }
        System.out.println("共有文件数"+files.length);

        // Never index past the end of the array when the folder holds fewer
        // than MAX_FILES entries.
        int limit = Math.min(files.length, MAX_FILES);
        for (int j = 0; j < limit; j++) {
            File file2 = files[j];
            int count = j + 1;
            System.out.println("count_have ==>>"+(files.length - count));

            // Check the extension first so that only ".doc" files are loaded.
            String fileName = file2.getName();
            int dot = fileName.lastIndexOf(".");
            if (dot < 0 || !"doc".equals(fileName.substring(dot + 1))) {
                continue;
            }
            String title = fileName.substring(0, dot);

            Document doc;
            try {
                doc = new Document(file2.getPath());
            } catch (Exception e) {
                System.out.println(count+">>file------false");
                continue;
            }

            System.out.println(count+">>doc名字:"+title);
            System.out.println(count+">>doc名字:"+fileName);
            System.out.println("doc内容:--------------------------------------------------------------------");
            try {
                // Collect the paragraphs of ALL sections before writing, so a
                // multi-section document is stored completely instead of each
                // section overwriting the previous one.
                StringBuilder context = new StringBuilder();
                for (int i = 0; i < doc.getSections().getCount(); i++) {
                    Section section = doc.getSections().get(i);
                    for (int p = 0; p < section.getParagraphs().getCount(); p++) {
                        context.append("\n").append(section.getParagraphs().get(p).getText());
                    }
                }
                fileupdatepart(context.toString(), title);
            } catch (Exception e) {
                System.out.println(count+">>read------false"+title);
            }
        }
    }
}

对比而言:

  (别问,实践证明……Python 快。)就上面两个示例而言,Python 读取 word 文件的速度比 Java 读取 word 文件的速度要快。