Tesseract Node.js – 使用“lang”选项导致崩溃

在 Tesseract.js 的基本用法中使用英语以外的语言,会导致整个程序崩溃:

// Minimal reproduction: run Tesseract.js (v1 job API) on a bundled test image
// using the German ('deu') language model instead of the default English one.
var options = {
  lang: 'deu',
};

// Resolve the image relative to this script so the working directory does not matter.
var image = require("path").join(__dirname, 'lib/images/ocr-test-text.png');
var Tesseract = require('tesseract.js');

Tesseract.recognize(image, options)
  .progress(function (info) {
    // Status updates: core loading, traineddata download/loading, recognition progress.
    console.log(info);
  })
  .then(function (data) {
    console.log('done', data);
    process.exit();
  })
  .catch(function (err) {
    // Surface rejected recognitions instead of leaving the promise chain unhandled,
    // and signal the failure through a non-zero exit code.
    console.error('recognition failed', err);
    process.exit(1);
  });

触发以下错误:

 > node index.js { status: 'loading tesseract core' } { status: 'loaded tesseract core' } { status: 'initializing tesseract', progress: 0 } pre-main prep time: 68 ms { status: 'initializing tesseract', progress: 1 } { status: 'downloading deu.traineddata.gz', loaded: 116, progress: 0.00011697604814572795 } events.js:182 throw er; // Unhandled 'error' event ^ Error: incorrect header check at Gunzip.zlibOnError (zlib.js:146:15) 

GitHub 问题: https://github.com/naptha/tesseract.js/issues/129

有人知道这是怎么回事吗?

更新:按照第一个答案的指示,并下载“deu”训练数据后,出现以下错误:

 export TESSDATA_PREFIX=/opt/TESSDATA && node get-text-from-image.js /opt/app/out/image.png params [ '/opt/app/out/image.png' ] progress { status: 'loading tesseract core' } progress { status: 'loaded tesseract core' } progress { status: 'initializing tesseract', progress: 0 } pre-main prep time: 62 ms progress { status: 'initializing tesseract', progress: 1 } progress { status: 'loading deu.traineddata', progress: 0 } progress { status: 'loading deu.traineddata', progress: 1 } progress { status: 'initializing api', progress: 0 } Failed loading language 'deu' Tesseract couldn't load any languages! progress { status: 'initializing api', progress: 0.3 } progress { status: 'initializing api', progress: 0.6 } progress { status: 'initializing api', progress: 1 } progress { status: 'recognizing text', progress: 0 } AdaptedTemplates != NULL:Error:Assert failed:in file ../classify/adaptmatch.cpp, line 190 /opt/app/node_modules/tesseract.js-core/index.js:4 function f(a){throw a;}var h=void 0,i=!0,j=null,k=!1;function aa(){return function(){}}function ba(a){return function(){return a}}var n,Module;Module||(Module=eval("(function() { try { return TesseractCore || {} } catch(e) { return {} } })()"));var ca={},da;for(da in Module)Module.hasOwnProperty(da)&&(ca[da]=Module[da]);var ea=i,fa=!ea&&i; ^ abort() at Error at Na (/opt/app/node_modules/tesseract.js-core/index.js:32:26) at Object.ka [as abort] (/opt/app/node_modules/tesseract.js-core/index.js:507:108) at _abort (/opt/app/node_modules/tesseract.js-core/index.js:373:173) at $L (/opt/app/node_modules/tesseract.js-core/index.js:383:55709) at jpa (/opt/app/node_modules/tesseract.js-core/index.js:388:22274) at lT (/opt/app/node_modules/tesseract.js-core/index.js:387:80568) at mT (/opt/app/node_modules/tesseract.js-core/index.js:387:80700) at Array.BS (/opt/app/node_modules/tesseract.js-core/index.js:387:69011) at bP (/opt/app/node_modules/tesseract.js-core/index.js:383:110121) at jT 
(/opt/app/node_modules/tesseract.js-core/index.js:387:80280) If this abort() is unexpected, build with -s ASSERTIONS=1 which can give more information. 

它没有解压 deu.traineddata.gz – 不知道为什么。 你可能想要自己下载文件,并尝试手工扫描。 这不是模块创建者建议的方式; 下面还有其他办法可以尝试。

在运行node.js的机器上下载语言文件并将其放置在某处。

https://github.com/tesseract-ocr/tessdata

在环境中,确保环境变量 TESSDATA_PREFIX 指向该位置。 例如,你可以把它们放在 /opt/tessdata。 如果你这样做,可以像下面这样设置 TESSDATA_PREFIX:

 export TESSDATA_PREFIX=/opt/tessdata 

再试一次; 这一次它应该不会再尝试自行下载和解压缩这些文件。