word转html

发布时间: 2023-11-07 20:25:44 作者: 爱编程_喵

word转html

1. maven依赖

<!-- Dependencies for converting Word documents to HTML -->
<!-- Apache POI OOXML: handles Word 2007 and later (.docx) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Apache POI scratchpad: handles Word 2003 (.doc) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- XDocReport converter: turns an XWPF (.docx) document into XHTML -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>2.0.2</version>
</dependency>

2. 实例

package com.baidu.cms.utils;
import cn.hutool.core.img.ImgUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;


/**
 * Converts Word documents ({@code .doc} and {@code .docx}) into a single HTML file.
 *
 * <p>Pictures are embedded into the HTML as Base64 data, so the output is one
 * self-contained file. All HTML is written and post-processed as UTF-8.
 */
public class WordUtils {
    private final static Logger log = LoggerFactory.getLogger(WordUtils.class);

    /** Tag inserted into the generated page so browsers decode it as UTF-8. */
    private static final String CHARSET_META =
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8'/>";

    /** Marker in front of which {@link #CHARSET_META} is inserted. */
    private static final String HEAD_CLOSE_TAG = "</head>";

    /**
     * Converts a Word file to HTML, choosing the converter from the file suffix.
     *
     * <p>After a successful conversion the generated file is post-processed by
     * {@link #editHtml(String)}. The post-processing step is skipped when the
     * conversion failed or the suffix is unsupported, so that a pre-existing
     * file at {@code outPath} is never modified by accident.
     *
     * @param sourcePath path of the source Word file
     * @param outPath    path of the HTML file to generate
     * @return {@code true} if the document was converted; {@code false} if the
     *         source does not exist, has an unsupported suffix, or conversion failed
     */
    public static boolean word2Html(String sourcePath, String outPath) {
        boolean flag = false;
        try {
            File file = new File(sourcePath);
            if (!file.exists()) {
                log.warn("word2html: source file not found: {}", sourcePath);
                return flag;
            }
            String fName = file.getName();
            String suffix = fName.substring(fName.lastIndexOf('.') + 1).toLowerCase();
            // Exact match: a suffix such as "xdoc" must not be treated as ".doc".
            if ("doc".equals(suffix)) {
                flag = docToHtml(sourcePath, outPath);
            } else if ("docx".equals(suffix)) {
                flag = docxToHtml(sourcePath, outPath);
            } else {
                log.warn("word2html: unsupported file suffix '{}' ({})", suffix, sourcePath);
            }
            // Add the charset tag (guards against garbled Chinese text), but only
            // for a file this call has actually produced.
            boolean editFlag = flag && editHtml(outPath);
            log.info("word2html({}->{}):parser({});edit({})", sourcePath, outPath, flag, editFlag);
        } catch (Exception e) {
            log.error("word2htmlError({}->{}):{}", sourcePath, outPath, String.valueOf(e), e);
        }
        return flag;
    }

    /**
     * Converts a Word 2003 ({@code .doc}) file to an HTML file.
     *
     * @param wordPath path of the Word file
     * @param htmlPath path of the HTML file to generate
     * @return {@code true} on success, {@code false} if any step failed (the
     *         failure is logged)
     */
    public static boolean docToHtml(String wordPath, String htmlPath) {
        boolean flag = false;
        // try-with-resources: the source stream and document are released even on failure.
        try (InputStream input = new FileInputStream(wordPath);
             HWPFDocument wordDocument = new HWPFDocument(input)) {
            File htmlFile = new File(htmlPath);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Embed each picture in the page as Base64 data instead of writing image files.
            wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> {
                BufferedImage bufferedImage = ImgUtil.toImage(content);
                String base64Img = ImgUtil.toBase64(bufferedImage, pictureType.getExtension());
                return "data:;base64," + base64Img;
            });

            // Parse the Word document into an HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            ensureParentDirectory(htmlFile);

            // Serialize the DOM to the target file as UTF-8 HTML.
            try (OutputStream outStream = new FileOutputStream(htmlFile)) {
                Transformer serializer = TransformerFactory.newInstance().newTransformer();
                serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));
            }
            flag = true;
        } catch (Exception e) {
            log.error("Doc解析异常({}->{}):{}", wordPath, htmlPath, String.valueOf(e), e);
        }
        return flag;
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) file to an HTML file.
     *
     * @param wordPath path of the Word file
     * @param htmlPath path of the HTML file to generate
     * @return {@code true} on success, {@code false} if any step failed (the
     *         failure is logged)
     */
    public static boolean docxToHtml(String wordPath, String htmlPath) {
        boolean flag = false;
        try {
            // NOTE(review): a negative ratio appears to switch off POI's zip-bomb
            // check, and the setter is static so it affects every later POI call
            // in this JVM - confirm this is acceptable for untrusted uploads.
            ZipSecureFile.setMinInflateRatio(-1.0d);
            File htmlFile = new File(htmlPath);
            ensureParentDirectory(htmlFile);

            // Pictures are embedded in the page as Base64 data, so no image folder is needed.
            XHTMLOptions options = XHTMLOptions.create().indent(4).setImageManager(new Base64EmbedImgManager());

            // All three resources are closed (and the writer flushed) before returning,
            // so the file is complete when editHtml reads it afterwards.
            try (InputStream in = new FileInputStream(wordPath);
                 XWPFDocument document = new XWPFDocument(in);
                 Writer out = new OutputStreamWriter(new FileOutputStream(htmlFile), StandardCharsets.UTF_8)) {
                // A Writer with an explicit charset keeps the output encoding
                // independent of the platform default.
                XHTMLConverter.getInstance().convert(document, out, options);
            }
            flag = true;
        } catch (Exception e) {
            log.error("Docx解析异常({}->{}):{}", wordPath, htmlPath, String.valueOf(e), e);
        }
        return flag;
    }


    /**
     * Inserts a UTF-8 {@code Content-Type} meta tag in front of {@code </head>} in the
     * given HTML file, to avoid occasionally garbled Chinese text.
     *
     * <p>The file is read and rewritten as UTF-8. Line terminators are dropped when the
     * content is read, so the rewritten file is a single line. If the file contains no
     * {@code </head>} tag it is left untouched and {@code false} is returned.
     *
     * @param htmlPath path of the HTML file to edit in place
     * @return {@code true} if the tag was inserted and the file rewritten;
     *         {@code false} otherwise (the reason is logged)
     */
    public static boolean editHtml(String htmlPath) {
        try {
            // Read the whole page, concatenating lines without their terminators.
            StringBuilder content = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(htmlPath), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    content.append(line);
                }
            }

            int headEnd = content.indexOf(HEAD_CLOSE_TAG);
            if (headEnd < 0) {
                log.warn("editHtml: no {} tag found in {}", HEAD_CLOSE_TAG, htmlPath);
                return false;
            }
            content.insert(headEnd, CHARSET_META);

            // Write the modified page back over the original file.
            try (BufferedWriter bw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(htmlPath), StandardCharsets.UTF_8))) {
                bw.write(content.toString());
            }
            return true;
        } catch (Exception e) {
            log.error("editHtml failed({}):{}", htmlPath, String.valueOf(e), e);
            return false;
        }
    }

    /**
     * Creates the parent directory of {@code target} if it does not exist yet.
     * A target without a parent component (a bare file name) needs no directory.
     */
    private static void ensureParentDirectory(File target) throws IOException {
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Unable to create directory: " + parent);
        }
    }

    public static void main(String[] args) {
        // A file must be converted to docx by real office software. Merely renaming
        // the suffix does not change the underlying format and causes an exception.
        word2Html("G:\\test\\download\\test.docx", "G:\\test\\download\\1.html");
        word2Html("G:\\test\\download\\test.doc", "G:\\test\\download\\2.html");
    }

}

image

image