feat(ocr): handle cache dir properly

This commit is contained in:
Elian Doran
2026-04-02 10:54:15 +03:00
parent 010f59df8a
commit ac310eaaf5
2 changed files with 10 additions and 6 deletions

View File

@@ -1,5 +1,3 @@
"use strict";
/*
* This file resolves the Trilium data path in this order of priority:
* - case A) if TRILIUM_DATA_DIR environment variable exists, then its value is used as the path
@@ -8,8 +6,8 @@
* - case D) as a fallback if the previous step fails, we'll use home dir
*/
import os from "os";
import fs from "fs";
import os from "os";
import { join as pathJoin } from "path";
const DIR_NAME = "trilium-data";
@@ -43,13 +41,14 @@ export function getTriliumDataDir(dataDirName: string) {
export function getDataDirs(TRILIUM_DATA_DIR: string) {
const dataDirs = {
TRILIUM_DATA_DIR: TRILIUM_DATA_DIR,
TRILIUM_DATA_DIR,
DOCUMENT_PATH: process.env.TRILIUM_DOCUMENT_PATH || pathJoin(TRILIUM_DATA_DIR, "document.db"),
BACKUP_DIR: process.env.TRILIUM_BACKUP_DIR || pathJoin(TRILIUM_DATA_DIR, "backup"),
LOG_DIR: process.env.TRILIUM_LOG_DIR || pathJoin(TRILIUM_DATA_DIR, "log"),
TMP_DIR: process.env.TRILIUM_TMP_DIR || pathJoin(TRILIUM_DATA_DIR, "tmp"),
ANONYMIZED_DB_DIR: process.env.TRILIUM_ANONYMIZED_DB_DIR || pathJoin(TRILIUM_DATA_DIR, "anonymized-db"),
CONFIG_INI_PATH: process.env.TRILIUM_CONFIG_INI_PATH || pathJoin(TRILIUM_DATA_DIR, "config.ini")
CONFIG_INI_PATH: process.env.TRILIUM_CONFIG_INI_PATH || pathJoin(TRILIUM_DATA_DIR, "config.ini"),
OCR_CACHE_DIR: pathJoin(TRILIUM_DATA_DIR, "ocr-cache")
} as const;
createDirIfNotExisting(dataDirs.TMP_DIR);

View File

@@ -1,5 +1,7 @@
import fs from 'fs';
import Tesseract from 'tesseract.js';
import dataDirs from '../../data_dir.js';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
@@ -55,6 +57,7 @@ export class ImageProcessor extends FileProcessor {
await this.worker.terminate();
log.info(`Initializing Tesseract worker for language(s): ${language}`);
this.worker = await Tesseract.createWorker(language, 1, {
cachePath: dataDirs.OCR_CACHE_DIR,
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`Image OCR progress (${language}): ${Math.round(m.progress * 100)}%`);
@@ -97,8 +100,9 @@ export class ImageProcessor extends FileProcessor {
try {
log.info('Initializing image OCR processor with Tesseract.js...');
fs.mkdirSync(dataDirs.OCR_CACHE_DIR, { recursive: true });
// Configure proper paths for Node.js environment
const tesseractDir = require.resolve('tesseract.js').replace('/src/index.js', '');
const workerPath = require.resolve('tesseract.js/src/worker-script/node/index.js');
const corePath = require.resolve('tesseract.js-core/tesseract-core.wasm.js');
@@ -108,6 +112,7 @@ export class ImageProcessor extends FileProcessor {
this.worker = await Tesseract.createWorker("eng", 1, {
workerPath,
corePath,
cachePath: dataDirs.OCR_CACHE_DIR,
logger: (m: { status: string; progress: number }) => {
if (m.status === 'recognizing text') {
log.info(`Image OCR progress: ${Math.round(m.progress * 100)}%`);