tesseract-wasm 基于webassembly 的tesseract npm 包-526互联

tesseract 是一个开源的 OCR 工具，社区提供了一个基于 WebAssembly 的 tesseract-wasm，可以方便地直接在浏览器中进行 OCR 识别
以下是一个简单的试用

项目代码

package.json

 
{

  "name": "tesseract",

  "version": "1.0.0",

  "main": "index.js",

  "license": "MIT",

  "dependencies": {

    "tesseract-wasm": "^0.10.0"

  },

  "devDependencies": {

    "vite": "^4.5.0"

  },

  "scripts": {

    "dev":"vite --force",

    "build":"vite build"

  }

}

vite.config.js

import { defineConfig } from "vite";
 
// Output naming patterns without a hash segment, so built files keep their
// plain names (e.g. `[name].js` instead of a hashed file name).
const unhashedOutputNames = {
  entryFileNames: "[name].js",
  chunkFileNames: "[name].js",
  assetFileNames: "[name].[ext]",
};

export default defineConfig({
  // Keep tesseract-wasm out of Vite's dependency optimization step.
  optimizeDeps: { exclude: ["tesseract-wasm"] },

  build: {
    rollupOptions: { output: unhashedOutputNames },
  },

  // Let the WebAssembly binary and the Tesseract model files be imported as
  // static assets.
  assetsInclude: ["**/*.wasm", "**/*.traineddata"],
});

测试代码

需要识别的图片

app.js

 
import { OCRClient } from 'tesseract-wasm';
 
// 识别一个中文的图片

import imgUrl from './6.png'
 
// 使用中文模型

import traineddataModel from './chi_sim.traineddata'
 
/**
 * Run OCR on the bundled sample image and show the recognized text on the page.
 *
 * Fetches `imgUrl`, decodes it into an ImageBitmap, loads `traineddataModel`
 * into a new OCRClient, and writes the recognized text into `document.body`.
 * The OCR client is always destroyed afterwards, whether or not recognition
 * succeeded.
 *
 * @returns {Promise<void>} Resolves once the text has been written to the page.
 * @throws {Error} If the image request completes with a non-OK HTTP status.
 */
async function runOCR() {
  // Fetch document image and decode it into an ImageBitmap.
  const imageResponse = await fetch(imgUrl);

  // fetch() only rejects on network failure; an HTTP error such as 404 still
  // resolves. Fail here with a clear message instead of letting the error
  // page body reach createImageBitmap() and fail as an image decode error.
  if (!imageResponse.ok) {
    throw new Error(
      `Failed to fetch image ${imgUrl}: HTTP ${imageResponse.status} ${imageResponse.statusText}`,
    );
  }

  const imageBlob = await imageResponse.blob();
  const image = await createImageBitmap(imageBlob);

  // Initialize the OCR engine. This will start a Web Worker to do the
  // work in the background.
  const ocr = new OCRClient();

  try {
    // Load the appropriate OCR training data for the image(s) we want to
    // process.
    await ocr.loadModel(traineddataModel);

    await ocr.loadImage(image);

    // Perform text recognition and return text in reading order.
    const text = await ocr.getText();

    // Render the result into the page.
    document.body.textContent = `tesseract-wasm result: ${text}`;
  } finally {
    // Once all OCR-ing has been done, shut down the Web Worker and free up
    // resources.
    ocr.destroy();
  }
}
 
// Start the demo. runOCR() returns a promise, so attach a rejection handler:
// without one a failure is only an unhandled rejection and the page stays
// blank.
runOCR().catch((error) => {
  console.error("tesseract-wasm OCR failed:", error);
  document.body.textContent = `tesseract-wasm error: ${error?.message ?? error}`;
});

效果

说明

目前来说 tesseract-wasm 本身并不是很大（2MB 左右），但是中文的模型比较大（约 50MB），对于一些简单场景，而且可以容忍加载时间的，可以考虑使用。
实际上做好优化效果也还可以。因为使用了 web worker，对于一些资源的加载会有一些问题，所以我使用了原始模式的 vite 构建，完整代码在 GitHub 中。同时 tesseract-wasm 也提供了 Node.js 支持，官方示例中提供了 Node.js 示例，如下：

 
import { readFileSync } from "node:fs";

import { fileURLToPath } from "node:url";

import { Command } from "commander";
 
import { createOCREngine } from "tesseract-wasm";

import { loadWasmBinary } from "tesseract-wasm/node";

import sharp from "sharp";
 
/**
 * Load an image through sharp and return its raw pixels with its dimensions.
 *
 * @param {string} path - Input handed to `sharp()`; the script passes the
 *   image file path from the command line.
 * @returns {Promise<{data: Buffer, width: number, height: number}>} Raw pixel
 *   buffer (alpha channel ensured) plus the width and height reported by the
 *   image metadata.
 */
async function loadImage(path) {
  // ensureAlpha() makes sure the pixel data carries an alpha channel.
  const pipeline = await sharp(path).ensureAlpha();

  const metadata = await pipeline.metadata();
  const pixels = await pipeline.raw().toBuffer();

  return { data: pixels, width: metadata.width, height: metadata.height };
}
 
/**
 * Resolve a path or URL relative to the current module and return it as a
 * file-system path.
 *
 * @param {string} path - Relative or absolute URL, interpreted against
 *   `import.meta.url`.
 * @returns {string} The corresponding file-system path.
 */
function resolve(path) {
  const moduleRelativeUrl = new URL(path, import.meta.url);
  return fileURLToPath(moduleRelativeUrl);
}
 
// Command-line interface: a single positional argument naming the image file.
const program = new Command()
  .description("Extract text from an image")
  .argument("file");
program.parse();
const [imagePath] = program.args;

// Initialize the OCR engine. In this demo we use the synchronous OCREngine
// API directly. In a server you would want to use the async OCRClient API
// instead.
const wasmBinary = await loadWasmBinary();
const engine = await createOCREngine({ wasmBinary });

// The model path is relative, so it is resolved against the current working
// directory.
const model = readFileSync("chi_sim.traineddata");
engine.loadModel(model);

// Load the image and perform OCR synchronously.
const image = await loadImage(imagePath);
engine.loadImage(image);

// Progress goes to stderr on a single, rewritten line so that stdout carries
// only the recognized text.
const reportProgress = (percent) => {
  process.stderr.write(`\rRecognizing text (${percent}% done)...`);
};
const text = engine.getText(reportProgress);

process.stderr.write("\n\n");
process.stdout.write(text);

参考资料

https://github.com/tesseract-ocr/tesseract
https://github.com/robertknight/tesseract-wasm
https://github.com/robertknight/tesseract-wasm/tree/main/examples
https://www.fabiofranchino.com/log/how-to-remove-hashing-in-vite-built-file-names/
https://github.com/vitejs/vite/issues/378
https://github.com/rongfengliang/tesseract-wasm-learning

tesseract tesseract-wasm webassembly wasm

tesseract-wasm

tesseract-wasm tesseract fastify wasm

tesseract webassembly ocr js

webassembly wasm-vips libvips wasm

containerd containerd-wasm-shims webassembly

开发工具webassembly wasm-pack工具

tesseract

tesseract-ocr

tesseract-ocr tesseract ocr