mirror of
https://github.com/zadam/trilium.git
synced 2026-05-07 01:26:52 +02:00
refactor(ocr): get rid of require imports
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
/**
|
||||
* Image processor for extracting text from image files using Tesseract
|
||||
@@ -135,7 +136,6 @@ export class ImageProcessor extends FileProcessor {
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const options = require('../../options.js').default;
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
@@ -161,8 +161,8 @@ export class ImageProcessor extends FileProcessor {
|
||||
};
|
||||
}
|
||||
|
||||
let filteredWords: string[] = [];
|
||||
let validConfidences: number[] = [];
|
||||
const filteredWords: string[] = [];
|
||||
const validConfidences: number[] = [];
|
||||
|
||||
// Tesseract provides word-level data
|
||||
if (data.words && Array.isArray(data.words)) {
|
||||
@@ -182,13 +182,12 @@ export class ImageProcessor extends FileProcessor {
|
||||
filteredText: data.text.trim(),
|
||||
overallConfidence
|
||||
};
|
||||
} else {
|
||||
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
|
||||
return {
|
||||
filteredText: '',
|
||||
overallConfidence
|
||||
};
|
||||
}
|
||||
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
|
||||
return {
|
||||
filteredText: '',
|
||||
overallConfidence
|
||||
};
|
||||
}
|
||||
|
||||
// Calculate average confidence of accepted words
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import * as officeParser from 'officeparser';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
|
||||
@@ -51,9 +53,9 @@ export class OfficeProcessor extends FileProcessor {
|
||||
|
||||
const result: OCRResult = {
|
||||
text: combinedText,
|
||||
confidence: confidence,
|
||||
confidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: language,
|
||||
language,
|
||||
pageCount: 1 // Office documents are treated as single logical document
|
||||
};
|
||||
|
||||
@@ -97,7 +99,6 @@ export class OfficeProcessor extends FileProcessor {
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const options = require('../../options.js').default;
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import * as pdfParse from 'pdf-parse';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import sharp from 'sharp';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* PDF processor for extracting text from PDF files
|
||||
@@ -58,7 +59,7 @@ export class PDFProcessor extends FileProcessor {
|
||||
private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
|
||||
try {
|
||||
const data = await pdfParse(buffer);
|
||||
|
||||
|
||||
return {
|
||||
text: data.text.trim(),
|
||||
confidence: 0.99, // High confidence for direct text extraction
|
||||
@@ -77,15 +78,15 @@ export class PDFProcessor extends FileProcessor {
|
||||
// Convert PDF to images and OCR each page
|
||||
// For now, we'll use a simple approach - convert first page to image
|
||||
// In a full implementation, we'd convert all pages
|
||||
|
||||
|
||||
// This is a simplified implementation
|
||||
// In practice, you might want to use pdf2pic or similar library
|
||||
// to convert PDF pages to images for OCR
|
||||
|
||||
|
||||
// For now, we'll return a placeholder result
|
||||
// indicating that OCR on PDF is not fully implemented
|
||||
log.info('PDF to image conversion not fully implemented, returning placeholder');
|
||||
|
||||
|
||||
return {
|
||||
text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
|
||||
confidence: 0.0,
|
||||
@@ -112,7 +113,6 @@ export class PDFProcessor extends FileProcessor {
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const options = require('../../options.js').default;
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
@@ -132,16 +132,16 @@ export class PDFProcessor extends FileProcessor {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
|
||||
// Check each language code: 2-3 letters, optionally followed by '_' and 2-3 more letters (e.g. 'eng', 'chi_sim')
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import sharp from 'sharp';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import options from '../../options.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
/**
|
||||
* TIFF processor for extracting text from multi-page TIFF files
|
||||
@@ -45,7 +47,7 @@ export class TIFFProcessor extends FileProcessor {
|
||||
for (let page = 0; page < pageCount; page++) {
|
||||
try {
|
||||
log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
|
||||
|
||||
|
||||
// Extract page as PNG buffer
|
||||
const pageBuffer = await sharp(buffer, { page })
|
||||
.png()
|
||||
@@ -53,10 +55,10 @@ export class TIFFProcessor extends FileProcessor {
|
||||
|
||||
// OCR the page
|
||||
const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
|
||||
|
||||
|
||||
if (pageResult.text.trim().length > 0) {
|
||||
if (combinedText.length > 0) {
|
||||
combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
|
||||
combinedText += `\n\n--- Page ${page + 1} ---\n`;
|
||||
}
|
||||
combinedText += pageResult.text;
|
||||
totalConfidence += pageResult.confidence;
|
||||
@@ -74,7 +76,7 @@ export class TIFFProcessor extends FileProcessor {
|
||||
confidence: averageConfidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || this.getDefaultOCRLanguage(),
|
||||
pageCount: pageCount
|
||||
pageCount
|
||||
};
|
||||
|
||||
log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
|
||||
@@ -99,7 +101,6 @@ export class TIFFProcessor extends FileProcessor {
|
||||
*/
|
||||
private getDefaultOCRLanguage(): string {
|
||||
try {
|
||||
const options = require('../../options.js').default;
|
||||
const ocrLanguage = options.getOption('ocrLanguage');
|
||||
if (!ocrLanguage) {
|
||||
throw new Error('OCR language not configured in user settings');
|
||||
@@ -119,16 +120,16 @@ export class TIFFProcessor extends FileProcessor {
|
||||
if (!language || typeof language !== 'string') {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Split by '+' for multi-language format
|
||||
const languages = language.split('+');
|
||||
|
||||
|
||||
// Check each language code: 2-3 letters, optionally followed by '_' and 2-3 more letters (e.g. 'eng', 'chi_sim')
|
||||
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
|
||||
|
||||
|
||||
return languages.every(lang => {
|
||||
const trimmed = lang.trim();
|
||||
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user