Excel word pdf查找

发布时间: 2023-11-13 14:45:48 | 作者: gsluofu

import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

/**

  • @author

  • @title: aa

  • @projectName

  • @description: TODO

  • @date 2022/10/14 19:27
    */
    public class OfficeTextSearch {

    private static final String SUCCESS = "success";

    private static final String CONTINUE = "continue";

    private static final String EXIT = "exit";

    public static void main(String[] args) {
    String pdfFileDir = "D:\用户\luofu704\Desktop\1031";
    String excelFileDir = "D:\fintchFile\allexcel";
    String wordFileDir = "D:\fintchFile\word";
    //默认前后50个字符
    int percount = 200;
    int aftercount = 200;
    //查找关键字
    String beginKey = null;
    //读取所有文件放到map中 key为path value 为 内容
    LinkedHashMap<String, String> fileMap = new LinkedHashMap<>();
    //读取pdf文件
    //readPDFFileMap(fileMap,pdfFileDir);
    //读取excel文件
    //readExcelFileMap(fileMap,excelFileDir);
    //读取word文件
    readWordFileMap(fileMap,wordFileDir);
    while(true){
    Scanner scanner = new Scanner(System.in);
    if(StringUtils.isEmpty(beginKey)){
    System.err.println("请输入查找的关键字(或者输入exit退出):");
    beginKey = scanner.nextLine();
    }
    if(StringUtils.isEmpty(beginKey)){
    System.err.println("请输入查找的关键字,关键字不能为空:");
    beginKey = scanner.nextLine();
    }
    if(EXIT.equals(beginKey)){
    return;
    }
    if(StringUtils.isNotBlank(beginKey)){
    findKeyContent(percount, aftercount, beginKey, fileMap);
    }
    //清除关键字
    beginKey = null;
    System.err.println(">>>>>>>>>>>>>>>>>>>>>>>>下一题<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
    }
    }

    public static void findKeyContent(int percount, int aftercount, String beginKey, LinkedHashMap<String, String> fileMap) {
    Set<Map.Entry<String, String>> entries = fileMap.entrySet();
    for (Map.Entry<String, String> entry : entries) {
    String message = getByStartAndEndKey(entry.getValue(), beginKey, entry.getKey(),percount,aftercount);
    if(SUCCESS.equals(message)){
    return;
    }else if(CONTINUE.equals(message)) {
    continue;
    }
    }
    }
    public static String getByStartAndEndKey(String content, String beginKey,String filePath,int percount,int aftercount) {
    //beginKey 开始关键字字符串
    int begin = content.indexOf(beginKey);
    if (begin > 0) {
    int strBegin = 0;
    int strEnd = content.length();
    //输出前后个字符串
    if (begin > percount) {
    //前10个字符
    strBegin = begin - percount;
    }
    if (begin + aftercount < strEnd) {
    strEnd = begin + aftercount;
    }
    String matchContent = content.substring(strBegin, strEnd);
    System.out.println();
    System.err.println("========================= 匹配文件: " + filePath + "================================================== ");
    System.out.println(matchContent);
    Scanner scanner = new Scanner(System.in);
    System.err.println("是否继续往下查找y/n:");
    String keyborad = scanner.nextLine();
    if(StringUtils.isEmpty(keyborad)){
    System.err.println("请输入y/n:");
    keyborad = scanner.nextLine();
    }
    if ("Y".equals(keyborad.toUpperCase())) {
    content = content.substring(begin + 1);
    String byStartAndEndKey = getByStartAndEndKey(content, beginKey,filePath, percount, aftercount);
    //多次搜索跳出循环
    if(SUCCESS.equals(byStartAndEndKey)){
    return SUCCESS;
    }
    } else if ("N".equals(keyborad.toUpperCase())) {
    return SUCCESS;
    }
    //继续下一个文件的检索
    return CONTINUE;
    } else {
    return CONTINUE;
    }
    }
    private static Map readPDFFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    //pdf 解析
    File file = new File(fileDir);
    File[] files = file.listFiles();
    PDDocument doc = null;
    StringBuffer sb = new StringBuffer();
    for (File item : files) {
    try {
    if (item.isDirectory()) {
    continue;
    }
    String filePath = item.getAbsolutePath();
    doc = PDDocument.load(new FileInputStream(filePath));
    int numberOfPages = doc.getNumberOfPages();
    PDFTextStripper pts = new PDFTextStripper();
    pts.setSortByPosition(true);
    for (int i = 1; i < numberOfPages+1; i++) {
    pts.setStartPage(i);
    pts.setEndPage(i);
    String text = pts.getText(doc);
    sb.append("文件:"+filePath+" 第"+i+"页:"+text).append("\n");
    }
    fileMap.put(filePath, sb.toString());
    //System.out.println(sb.toString());
    } catch (Exception e) {
    System.err.println("##############文件解析异常:" + item.getName());
    } finally {
    try {
    doc.close();
    } catch (IOException e) {
    System.err.println("##############文件流工具关闭失败!!!");
    }
    }
    }
    return fileMap;
    }
    private static Map readExcelFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    OPCPackage opcPackage = null;
    //word 解析
    File file = new File(fileDir);
    File[] files = file.listFiles();
    for (File item : files) {
    try {
    if (item.isDirectory()) {
    continue;
    }
    String filePath = item.getAbsolutePath();
    opcPackage = POIXMLDocument.openPackage(filePath);
    XSSFExcelExtractor xe = new XSSFExcelExtractor(opcPackage);
    xe.setFormulasNotResults(true);
    xe.setIncludeSheetNames(true);
    String content = xe.getText();;
    fileMap.put(filePath, content);
    } catch (Exception e) {
    System.err.println("##############文件解析异常:" + item.getName());
    } finally {
    try {
    opcPackage.close();
    } catch (IOException e) {
    System.err.println("##############文件流工具关闭失败!!!");
    }
    }
    }
    return fileMap;
    }
    private static Map readWordFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    OPCPackage opcPackage = null;
    //word 解析
    File file = new File(fileDir);
    File[] files = file.listFiles();
    for (File item : files) {
    try {
    if (item.isDirectory()) {
    continue;
    }
    String filePath = item.getAbsolutePath();
    opcPackage = POIXMLDocument.openPackage(filePath);
    XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(opcPackage);
    String content = xwpfWordExtractor.getText();
    fileMap.put(filePath, content);
    } catch (Exception e) {
    System.err.println("##############文件解析异常:" + item.getName());
    } finally {
    try {
    opcPackage.close();
    } catch (IOException e) {
    System.err.println("##############文件流工具关闭失败!!!");
    }
    }
    }
    return fileMap;
    }

    /*

    com.itextpdf
    itextpdf
    5.5.13.2
    jar


    org.apache.pdfbox
    pdfbox
    2.0.9


    org.apache.pdfbox
    fontbox
    2.0.9

    org.apache.poi poi 4.1.1 org.apache.poi poi-ooxml 4.1.1 commons-lang commons-lang 2.6 javac -classpath .;D:\Fsearch\lib\poi-4.1.2.jar;D:\Fsearch\lib\poi-ooxml-4.1.2.jar;D:\Fsearch\lib\poi-ooxml-schemas-4.1.2.jar;D:\Fsearch\lib\rt.jar;D:\Fsearch\lib\commons-lang-2.6.jar;D:\Fsearch\lib\xmlbeans-3.1.0.jar -d D:\Fsearch\out aa.java */

}