今日团队个人作业

发布时间 2023-04-17 23:38:34作者: 小旺财

1.昨天将OpenCV库搞了搞,学习了图像识别和Maven,花了两个多小时

2.不会配置,一直报错,不知道为什么明明是库中自带的方法,却调用不了

3.今天OCR搞一搞,对表格进行一下数据的清洗

package com.example.demo;

import com.sun.org.slf4j.internal.Logger;
import com.sun.org.slf4j.internal.LoggerFactory;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfPoint;
import org.opencv.core.Range;
import org.opencv.highgui.HighGui;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.objdetect.Objdetect;
import org.w3c.dom.css.Rect;

import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class TableScanner {
private static final Logger logger = LoggerFactory.getLogger(TableScanner.class);

private static final String TESSERACT_DATA_PATH = "/usr/share/tesseract-ocr/4.00/tessdata"; // Tesseract OCR数据路径

private String imagePath; // 图像文件路径
private String tablePath; // 表格文件保存路径

public TableScanner(String imagePath, String tablePath) {
this.imagePath = imagePath;
this.tablePath = tablePath;
}

public void scanTable() throws IOException {
// 加载OpenCV库
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

// 读取图像文件
Mat image = Imgcodecs.imread(imagePath);

// 转换为灰度图像
Mat gray = new Mat();
Imgproc.cvtColor(image, gray, Imgproc.COLOR_BGR2GRAY);

// 对图像进行二值化处理
Mat binary = new Mat();
Imgproc.threshold(gray, binary, 0, 255, Imgproc.THRESH_BINARY_INV | Imgproc.THRESH_OTSU);

// 进行表格检测
Rect tableRect = detectTable(binary);

if (tableRect == null) {
logger.warn("Failed to detect table in image: {}", imagePath);
return;
}

// 提取表格区域
Mat table = new Mat(image, (Range) tableRect);

// 进行表格识别
ITesseract tesseract = new Tesseract();
tesseract.setDatapath(TESSERACT_DATA_PATH);
tesseract.setLanguage("eng"); // 使用英文语言库
String text = null;
try {
text = tesseract.doOCR(table);
} catch (TesseractException e) {
logger.error("Failed to recognize table in image: {}", imagePath, e);
return;
}

// 保存为CSV文件
Path tableFile = Paths.get(tablePath);
Files.write(tableFile, text.getBytes());
}

/**
* 检测图像中的表格区域
*/
private Rect detectTable(Mat binary) {
// 进行轮廓检测
List<MatOfPoint> contours = Lists.newArrayList();
Mat hierarchy = new Mat();
Imgproc.findContours(binary, contours, hierarchy, Imgproc.RETR_EXTERNAL, Imgproc.CHAIN_APPROX_SIMPLE);

// 查找最大的矩形轮廓
Rect maxRect = null;
double maxArea = 0;
for (MatOfPoint contour : contours) {
Rect rect = (Rect) Imgproc.boundingRect(contour);
double area = rect.area();
if (area > maxArea && isTable(rect)) {
maxRect = rect;
maxArea = area;
}
}

return maxRect;
}

/**
* 判断矩形是否为表格
*/
private boolean isTable(Rect rect) {
return rect.width > 50 && rect.height > 50;

}

}