diff --git a/apps/server/src/services/ocr/processors/image_processor.ts b/apps/server/src/services/ocr/processors/image_processor.ts index 52145cac3e..ecde074be6 100644 --- a/apps/server/src/services/ocr/processors/image_processor.ts +++ b/apps/server/src/services/ocr/processors/image_processor.ts @@ -1,8 +1,9 @@ import Tesseract from 'tesseract.js'; -import { FileProcessor } from './file_processor.js'; -import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; + import log from '../../log.js'; import options from '../../options.js'; +import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; +import { FileProcessor } from './file_processor.js'; /** * Image processor for extracting text from image files using Tesseract @@ -135,7 +136,6 @@ export class ImageProcessor extends FileProcessor { */ private getDefaultOCRLanguage(): string { try { - const options = require('../../options.js').default; const ocrLanguage = options.getOption('ocrLanguage'); if (!ocrLanguage) { throw new Error('OCR language not configured in user settings'); @@ -161,8 +161,8 @@ export class ImageProcessor extends FileProcessor { }; } - let filteredWords: string[] = []; - let validConfidences: number[] = []; + const filteredWords: string[] = []; + const validConfidences: number[] = []; // Tesseract provides word-level data if (data.words && Array.isArray(data.words)) { @@ -182,13 +182,12 @@ export class ImageProcessor extends FileProcessor { filteredText: data.text.trim(), overallConfidence }; - } else { - log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`); - return { - filteredText: '', - overallConfidence - }; } + log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`); + return { + filteredText: '', + overallConfidence + }; } // Calculate average confidence of accepted words diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index 8e99eea559..826ada7b3d 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,8 +1,10 @@ import * as officeParser from 'officeparser'; -import { FileProcessor } from './file_processor.js'; -import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; -import { ImageProcessor } from './image_processor.js'; + import log from '../../log.js'; +import options from '../../options.js'; +import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; +import { FileProcessor } from './file_processor.js'; +import { ImageProcessor } from './image_processor.js'; /** * Office document processor for extracting text and images from DOCX/XLSX/PPTX files @@ -51,9 +53,9 @@ export class OfficeProcessor extends FileProcessor { const result: OCRResult = { text: combinedText, - confidence: confidence, + confidence, extractedAt: new Date().toISOString(), - language: language, + language, pageCount: 1 // Office documents are treated as single logical document }; @@ -97,7 +99,6 @@ export class OfficeProcessor extends FileProcessor { */ private getDefaultOCRLanguage(): string { try { - const options = require('../../options.js').default; const ocrLanguage = options.getOption('ocrLanguage'); if (!ocrLanguage) { throw new Error('OCR language not configured in user settings'); diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts b/apps/server/src/services/ocr/processors/pdf_processor.ts index 9027159000..7b236d974b 100644 --- a/apps/server/src/services/ocr/processors/pdf_processor.ts +++ b/apps/server/src/services/ocr/processors/pdf_processor.ts @@ -1,9 +1,10 @@ import * as pdfParse from 'pdf-parse'; -import { FileProcessor } from './file_processor.js'; -import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; -import { ImageProcessor } from './image_processor.js'; + import log from '../../log.js'; -import sharp from 'sharp'; +import options from '../../options.js'; +import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; +import { FileProcessor } from './file_processor.js'; +import { ImageProcessor } from './image_processor.js'; /** * PDF processor for extracting text from PDF files @@ -58,7 +59,7 @@ export class PDFProcessor extends FileProcessor { private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise { try { const data = await pdfParse(buffer); - + return { text: data.text.trim(), confidence: 0.99, // High confidence for direct text extraction @@ -77,15 +78,15 @@ export class PDFProcessor extends FileProcessor { // Convert PDF to images and OCR each page // For now, we'll use a simple approach - convert first page to image // In a full implementation, we'd convert all pages - + // This is a simplified implementation // In practice, you might want to use pdf2pic or similar library // to convert PDF pages to images for OCR - + // For now, we'll return a placeholder result // indicating that OCR on PDF is not fully implemented log.info('PDF to image conversion not fully implemented, returning placeholder'); - + return { text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]', confidence: 0.0, @@ -112,7 +113,6 @@ export class PDFProcessor extends FileProcessor { */ private getDefaultOCRLanguage(): string { try { - const options = require('../../options.js').default; const ocrLanguage = options.getOption('ocrLanguage'); if (!ocrLanguage) { throw new Error('OCR language not configured in user settings'); @@ -132,16 +132,16 @@ export class PDFProcessor extends FileProcessor { if (!language || typeof language !== 'string') { return false; } - + // Split by '+' for multi-language format const languages = language.split('+'); - + // Check each language code (should be 2-7 characters, alphanumeric with underscores) const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; - + return languages.every(lang => { const trimmed = lang.trim(); return trimmed.length > 0 && validLanguagePattern.test(trimmed); }); } -} \ No newline at end of file +} diff --git a/apps/server/src/services/ocr/processors/tiff_processor.ts b/apps/server/src/services/ocr/processors/tiff_processor.ts index 2fba58ce92..6ce33b45c1 100644 --- a/apps/server/src/services/ocr/processors/tiff_processor.ts +++ b/apps/server/src/services/ocr/processors/tiff_processor.ts @@ -1,8 +1,10 @@ import sharp from 'sharp'; -import { FileProcessor } from './file_processor.js'; -import { OCRResult, OCRProcessingOptions } from '../ocr_service.js'; -import { ImageProcessor } from './image_processor.js'; + import log from '../../log.js'; +import options from '../../options.js'; +import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; +import { FileProcessor } from './file_processor.js'; +import { ImageProcessor } from './image_processor.js'; /** * TIFF processor for extracting text from multi-page TIFF files @@ -45,7 +47,7 @@ export class TIFFProcessor extends FileProcessor { for (let page = 0; page < pageCount; page++) { try { log.info(`Processing TIFF page ${page + 1}/${pageCount}...`); - + // Extract page as PNG buffer const pageBuffer = await sharp(buffer, { page }) .png() @@ -53,10 +55,10 @@ export class TIFFProcessor extends FileProcessor { // OCR the page const pageResult = await this.imageProcessor.extractText(pageBuffer, options); - + if (pageResult.text.trim().length > 0) { if (combinedText.length > 0) { - combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n'; + combinedText += `\n\n--- Page ${page + 1} ---\n`; } combinedText += pageResult.text; totalConfidence += pageResult.confidence; @@ -74,7 +76,7 @@ export class TIFFProcessor extends FileProcessor { confidence: averageConfidence, extractedAt: new Date().toISOString(), language: options.language || this.getDefaultOCRLanguage(), - pageCount: pageCount + pageCount }; log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`); @@ -99,7 +101,6 @@ export class TIFFProcessor extends FileProcessor { */ private getDefaultOCRLanguage(): string { try { - const options = require('../../options.js').default; const ocrLanguage = options.getOption('ocrLanguage'); if (!ocrLanguage) { throw new Error('OCR language not configured in user settings'); @@ -119,16 +120,16 @@ export class TIFFProcessor extends FileProcessor { if (!language || typeof language !== 'string') { return false; } - + // Split by '+' for multi-language format const languages = language.split('+'); - + // Check each language code (should be 2-7 characters, alphanumeric with underscores) const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/; - + return languages.every(lang => { const trimmed = lang.trim(); return trimmed.length > 0 && validLanguagePattern.test(trimmed); }); } -} \ No newline at end of file +}