Tesseract is an open-source OCR engine. The community provides tesseract-wasm, a WebAssembly build of it, which makes it easy to run OCR directly in the browser.
Below is a simple trial run.
Project code
- package.json
{
  "name": "tesseract",
  "version": "1.0.0",
  "main": "index.js",
  "license": "MIT",
  "dependencies": {
    "tesseract-wasm": "^0.10.0"
  },
  "devDependencies": {
    "vite": "^4.5.0"
  },
  "scripts": {
    "dev": "vite --force",
    "build": "vite build"
  }
}
- vite.config.js
import { defineConfig } from "vite";

export default defineConfig({
  optimizeDeps: {
    // Exclude tesseract-wasm from Vite's dependency pre-bundling so its
    // Web Worker and .wasm files are served as-is.
    exclude: ["tesseract-wasm"],
  },
  build: {
    rollupOptions: {
      output: {
        // Emit output files without content hashes so their names stay predictable.
        entryFileNames: `[name].js`,
        chunkFileNames: `[name].js`,
        assetFileNames: `[name].[ext]`,
      },
    },
  },
  // Treat .wasm and .traineddata files as static assets; importing them yields a URL.
  assetsInclude: ["**/*.wasm", "**/*.traineddata"],
});
- Test code
The image to recognize (6.png):
app.js
import { OCRClient } from "tesseract-wasm";

// The image to recognize (contains Chinese text).
import imgUrl from "./6.png";
// Use the simplified-Chinese model.
import traineddataModel from "./chi_sim.traineddata";

async function runOCR() {
  // Fetch the document image and decode it into an ImageBitmap.
  const imageResponse = await fetch(imgUrl);
  const imageBlob = await imageResponse.blob();
  const image = await createImageBitmap(imageBlob);

  // Initialize the OCR engine. This will start a Web Worker to do the
  // work in the background.
  const ocr = new OCRClient();

  try {
    // Load the appropriate OCR training data for the image(s) we want to
    // process.
    await ocr.loadModel(traineddataModel);

    await ocr.loadImage(image);

    // Perform text recognition and return text in reading order.
    const text = await ocr.getText();

    // Render the result to the page.
    document.body.textContent = `tesseract-wasm result: ${text}`;
  } finally {
    // Once all OCR-ing has been done, shut down the Web Worker and free up
    // resources.
    ocr.destroy();
  }
}

runOCR();
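The demo only uses getText, but the library also exposes positioned results. Here is a minimal sketch, assuming the getTextBoxes("word") method listed in the tesseract-wasm API docs is available in the installed version (verify before relying on it):

// Hedged sketch: assumes OCRClient#getTextBoxes("word") from the
// tesseract-wasm API docs; each returned item should carry the recognized
// text and its bounding rectangle on the source image.
async function logWordBoxes(ocr) {
  const words = await ocr.getTextBoxes("word");
  for (const item of words) {
    // item.text is the recognized word, item.rect its position in the image.
    console.log(item.text, item.rect);
  }
}

Called right after `await ocr.loadImage(image)` in runOCR above, this would make it possible to highlight the recognized words on top of the original image.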
- Result
Notes
The tesseract-wasm runtime itself is not very large (around 2 MB), but the Chinese model is fairly big (around 50 MB). For simple scenarios where the load time is acceptable, it is still usable; one mitigation, sketched below, is to cache the model so it is only downloaded once.
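A minimal sketch of that caching idea, using the browser Cache API so the ~50 MB traineddata file is fetched only once per origin. It assumes, as the Node example further down suggests, that loadModel also accepts the raw model bytes (an ArrayBuffer) in addition to a URL string:

// Hedged sketch: cache the traineddata file with the Cache API so repeat
// visits skip the large download. Assumes OCRClient#loadModel accepts an
// ArrayBuffer as well as a URL string.
async function loadModelCached(ocr, modelUrl) {
  const cache = await caches.open("ocr-models");
  let response = await cache.match(modelUrl);
  if (!response) {
    response = await fetch(modelUrl);
    // Store a copy; a Response body can only be read once.
    await cache.put(modelUrl, response.clone());
  }
  await ocr.loadModel(await response.arrayBuffer());
}

In app.js above, `await ocr.loadModel(traineddataModel)` could then be replaced with `await loadModelCached(ocr, traineddataModel)`.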
With some optimization it actually works reasonably well. Because the library uses a Web Worker, there can be some loading issues under bundlers, so I used a fairly raw Vite build configuration (see the optimizeDeps exclusion above); the complete code is on GitHub. tesseract-wasm also supports Node.js, and the official examples include the following Node.js example:
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

import { Command } from "commander";
import { createOCREngine } from "tesseract-wasm";
import { loadWasmBinary } from "tesseract-wasm/node";
import sharp from "sharp";

async function loadImage(path) {
  const image = await sharp(path).ensureAlpha();
  const { width, height } = await image.metadata();
  return {
    data: await image.raw().toBuffer(),
    width,
    height,
  };
}

/** Resolve a URL relative to the current module. */
function resolve(path) {
  return fileURLToPath(new URL(path, import.meta.url).href);
}

const program = new Command();
program.description("Extract text from an image");
program.argument("file");
program.parse();

// Initialize the OCR engine. In this demo we use the synchronous OCREngine
// API directly. In a server you would want to use the async OCRClient API
// instead.
const wasmBinary = await loadWasmBinary();
const engine = await createOCREngine({ wasmBinary });
const model = readFileSync("chi_sim.traineddata");
engine.loadModel(model);

// Load the image and perform OCR synchronously.
const image = await loadImage(program.args[0]);
engine.loadImage(image);
const text = engine.getText((progress) => {
  process.stderr.write(`\rRecognizing text (${progress}% done)...`);
});
process.stderr.write("\n\n");
process.stdout.write(text);
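To try this, the script also needs sharp and commander installed next to tesseract-wasm, plus the chi_sim.traineddata file in the working directory. Since it uses top-level await, it has to run as an ES module; saved as, say, ocr.mjs (the file name here is just an example), it would be invoked as `node ocr.mjs 6.png`, printing progress to stderr and the recognized text to stdout.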
References
https://github.com/tesseract-ocr/tesseract
https://github.com/robertknight/tesseract-wasm
https://github.com/robertknight/tesseract-wasm/tree/main/examples
https://www.fabiofranchino.com/log/how-to-remove-hashing-in-vite-built-file-names/
https://github.com/vitejs/vite/issues/378
https://github.com/rongfengliang/tesseract-wasm-learning