From baa93cb371bfb766a290b5626f15421faf58d00e Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Sun, 5 Apr 2026 22:14:01 +0300 Subject: [PATCH] chore(ocr): expose needed dependencies --- apps/server/scripts/build.ts | 4 ++++ .../ocr/processors/image_processor.ts | 19 +++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/apps/server/scripts/build.ts b/apps/server/scripts/build.ts index 9fa1f9cb83..73a3aeaee3 100644 --- a/apps/server/scripts/build.ts +++ b/apps/server/scripts/build.ts @@ -12,6 +12,10 @@ async function main() { // Copy node modules dependencies build.copyNodeModules([ "better-sqlite3", "bindings", "file-uri-to-path" ]); + + // Tesseract.js worker runs in a separate worker_thread and needs its + // source files (+ WASM core) on disk — they cannot be bundled. + build.copyNodeModules([ "tesseract.js", "tesseract.js-core", "wasm-feature-detect" ]); build.copy("/node_modules/ckeditor5/dist/ckeditor5-content.css", "ckeditor5-content.css"); build.buildFrontend(); diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts index 15bfe78085..a7a09acd85 100644 --- a/apps/server/src/services/ocr/processors/image_processor.ts +++ b/apps/server/src/services/ocr/processors/image_processor.ts @@ -1,9 +1,11 @@ import fs from 'fs'; +import path from 'path'; import Tesseract from 'tesseract.js'; import dataDirs from '../../data_dir.js'; import log from '../../log.js'; import options from '../../options.js'; +import { getResourceDir, isDev } from '../../utils.js'; import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; @@ -79,14 +81,27 @@ export class ImageProcessor extends FileProcessor { fs.mkdirSync(dataDirs.OCR_CACHE_DIR, { recursive: true }); log.info(`Initializing Tesseract worker for language(s): ${language}`); - this.worker = await Tesseract.createWorker(language, 1, { + + const workerOptions: 
Record<string, unknown> = { cachePath: dataDirs.OCR_CACHE_DIR, logger: (m: { status: string; progress: number }) => { if (m.status === 'recognizing text') { log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`); } } - }); + }; + + // In production the server is bundled, so tesseract.js's default + // __dirname-based worker path is wrong. Point it at the copy we + // place in dist/node_modules during the build step. + if (!isDev) { + workerOptions.workerPath = path.join( + getResourceDir(), + 'node_modules', 'tesseract.js', 'src', 'worker-script', 'node', 'index.js' + ); + } + + this.worker = await Tesseract.createWorker(language, 1, workerOptions); this.currentLanguage = language; }