tesseract-wasm 基于webassembly 的tesseract npm 包

发布时间 2023-11-01 21:48:56作者: 荣锋亮

tesseract 是一个开源的 OCR 工具,社区提供了一个基于 WebAssembly 的 tesseract-wasm,可以方便地直接在浏览器中进行 OCR 识别
以下是一个简单的试用

项目代码

  • package.json
 
{
  "name": "tesseract",
  "version": "1.0.0",
  "main": "index.js",
  "license": "MIT",
  "dependencies": {
    "tesseract-wasm": "^0.10.0"
  },
  "devDependencies": {
    "vite": "^4.5.0"
  },
  "scripts": {
    "dev":"vite --force",
    "build":"vite build"
  }
}
  • vite.config.js
import { defineConfig } from "vite";
 
// Rollup output name patterns that contain no hash placeholder, so the
// emitted entry, chunk and asset files keep their plain names.
const plainOutputNames = {
  entryFileNames: "[name].js",
  chunkFileNames: "[name].js",
  assetFileNames: "[name].[ext]",
};

// File globs that Vite should handle as static assets; app.js imports the
// .traineddata model through a plain `import` statement.
const extraAssetGlobs = ["**/*.wasm", "**/*.traineddata"];

export default defineConfig({
  // Leave tesseract-wasm out of Vite's dependency optimization step.
  optimizeDeps: {
    exclude: ["tesseract-wasm"],
  },
  build: {
    rollupOptions: {
      output: plainOutputNames,
    },
  },
  assetsInclude: extraAssetGlobs,
});
  • 测试代码

需要识别的图片

app.js


 
import { OCRClient } from 'tesseract-wasm';
 
// 识别一个中文的图片
import imgUrl from './6.png'
 
// 使用中文模型
import traineddataModel from './chi_sim.traineddata'
 
/**
 * Recognize the text in the bundled sample image and show it on the page.
 *
 * Fetches `imgUrl`, decodes it into an ImageBitmap, loads the model at
 * `traineddataModel` into a new OCRClient, runs recognition and writes the
 * recognized text into `document.body`.
 *
 * @returns {Promise<void>} Settles after the result has been rendered and
 *   the OCR client has been destroyed.
 * @throws {Error} If the image request completes with a non-OK HTTP status.
 */
async function runOCR() {
  // Fetch document image and decode it into an ImageBitmap.
  const imageResponse = await fetch(imgUrl);
  if (!imageResponse.ok) {
    // Fail early with a clear message instead of handing an error page to
    // the image decoder.
    throw new Error(
      `Failed to fetch image ${imgUrl}: HTTP ${imageResponse.status}`,
    );
  }
  const imageBlob = await imageResponse.blob();
  const image = await createImageBitmap(imageBlob);

  // Initialize the OCR engine. This will start a Web Worker to do the
  // work in the background.
  const ocr = new OCRClient();

  try {
    // Load the appropriate OCR training data for the image(s) we want to
    // process.
    await ocr.loadModel(traineddataModel);

    await ocr.loadImage(image);

    // Perform text recognition and return text in reading order.
    const text = await ocr.getText();

    // Render the result into the page.
    document.body.textContent = `tesseract-wasm result: ${text}`;
  } finally {
    // Once all OCR-ing has been done, shut down the Web Worker and free up
    // resources.
    ocr.destroy();
  }
}
 
// Start recognition as soon as this module is evaluated.
// NOTE(review): the promise returned by runOCR() is neither awaited nor
// given a rejection handler here.
runOCR();
  • 效果

 

说明

目前来说 tesseract-wasm 本身并不大(约 2 MB),但是中文模型比较大(约 50 MB),对于一些比较简单、并且可以容忍加载时间的场景是可以使用的
实际上做好优化效果也还可以。因为使用了 Web Worker,一些资源的加载会有一些问题,所以我使用了原始模式的 vite 构建,完整代码在 GitHub 中。同时 tesseract-wasm 也提供了 Node.js 支持,官方示例中提供了如下 Node.js 示例

 
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";
import { Command } from "commander";
 
import { createOCREngine } from "tesseract-wasm";
import { loadWasmBinary } from "tesseract-wasm/node";
import sharp from "sharp";
 
/**
 * Read an image file with sharp and return its raw pixel buffer together
 * with the dimensions reported by the image metadata.
 *
 * @param {string} path - Location of the image, passed straight to sharp().
 * @returns {Promise<{data: Buffer, width: number, height: number}>} Raw
 *   output of the sharp pipeline plus the metadata width and height.
 */
async function loadImage(path) {
  // ensureAlpha() is applied before raw output is requested; presumably this
  // yields 4-channel pixel data — confirm against the sharp docs.
  const pipeline = await sharp(path).ensureAlpha();
  const meta = await pipeline.metadata();
  const pixels = await pipeline.raw().toBuffer();

  return {
    data: pixels,
    width: meta.width,
    height: meta.height,
  };
}
 
/** Resolve a URL relative to the current module. */
function resolve(path) {
  // Build the absolute file: URL against this module's own URL, then turn it
  // into a platform file-system path.
  const target = new URL(path, import.meta.url);
  return fileURLToPath(target);
}
 
// Command-line interface: a single positional argument named "file", whose
// value is later passed to loadImage() as the image to recognize.
const program = new Command();
program.description("Extract text from an image");
program.argument("file");
program.parse();
 
// Initialize the OCR engine. In this demo we use the synchronous OCREngine
// API directly. In a server you would want to use the async OCRClient API
// instead.
const wasmBinary = await loadWasmBinary();
const engine = await createOCREngine({ wasmBinary });
 
// Read the chi_sim model from disk and hand it to the engine. The bare file
// name is a relative path, so it is looked up from the process's current
// working directory.
// NOTE(review): the resolve() helper defined above is not used here — confirm
// whether the model was meant to be located relative to this module instead.
const model = readFileSync("chi_sim.traineddata");
engine.loadModel(model);
 
// Load the image and perform OCR synchronously.
const image = await loadImage(program.args[0]);
engine.loadImage(image);
 
// Progress is reported on stderr; the leading "\r" returns to the start of
// the line so each update is written over the previous one. Only the
// recognized text itself goes to stdout.
const text = engine.getText((progress) => {
  process.stderr.write(`\rRecognizing text (${progress}% done)...`);
});
process.stderr.write("\n\n");
process.stdout.write(text);

参考资料

https://github.com/tesseract-ocr/tesseract
https://github.com/robertknight/tesseract-wasm
https://github.com/robertknight/tesseract-wasm/tree/main/examples
https://www.fabiofranchino.com/log/how-to-remove-hashing-in-vite-built-file-names/
https://github.com/vitejs/vite/issues/378
https://github.com/rongfengliang/tesseract-wasm-learning