解析pdf去掉水印内容

发布时间 2023-10-19 16:18:41作者: K_Unicode

因为水印内容一般是有倾斜度的,所以通过判断文字的倾斜度就可以去掉水印内容了。
PDFTextStripper.getText(document)底层是通过writeString(String text, List<TextPosition> textPositions)来输出内容的,所以可以通过重写writeString()方法来实现。

public class MyPDFTextStripper extends PDFTextStripper {
    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @throws IOException If there is an error loading the properties.
     */
    public DepPDFTextStripper() throws IOException {
    }

    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        Iterator<TextPosition> iterator = textPositions.iterator();
        while (iterator.hasNext()) {
            TextPosition position = iterator.next();
            int angle = getAngle(position);
            if (Math.abs(angle) > 10) {
                iterator.remove();
            }
        }
        text = textPositions.stream().filter(elm->!Objects.isNull(elm)).map(TextPosition::getUnicode).collect(Collectors.joining());
        super.writeString(text, textPositions);
    }

    /**
     * 获取字体倾斜度
     *
     * @param text 当前字符对象
     * @return 倾斜度
     */
    public static int getAngle(TextPosition text) {
        Matrix m = text.getTextMatrix().clone();
        m.concatenate(text.getFont().getFontMatrix());
        return (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
    }
}

获取pdf内容

        // Both resources are closed automatically, in reverse order of declaration.
        try (FileInputStream inputStream = new FileInputStream(new File("test.pdf"));
             PDDocument doc = PDDocument.load(inputStream)) {
            PDFTextStripper textStripper = new DepPDFTextStripper();
            int numberOfPages = doc.getNumberOfPages();
            log.info("Current pdf have {} page", numberOfPages);
            // Text extracted from the PDF by the custom stripper.
            String psText = textStripper.getText(doc);
        } catch (Exception e) {
            // Report the failure through the logger at error level instead of
            // printStackTrace(). NOTE(review): assumes an SLF4J-style logger, where a
            // trailing Throwable argument is logged with its stack trace - confirm.
            log.error("parse PDF error {}", e.getMessage(), e);
        }