diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index 5284002367..5f4eb044c2 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -620,7 +620,9 @@ class OCRService { */ getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> { try { - // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types) + const supportedMimes = this.getAllSupportedMimeTypes(); + const placeholders = supportedMimes.map(() => '?').join(', '); + const noteBlobs = sql.getRows<{ blobId: string; mimeType: string; @@ -629,35 +631,12 @@ class OCRService { SELECT n.blobId, n.mime as mimeType, n.noteId as entityId FROM notes n JOIN blobs b ON n.blobId = b.blobId - WHERE ( - n.type = 'image' - OR ( - n.type = 'file' - AND n.mime IN ( - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/msword', - 'application/vnd.ms-excel', - 'application/vnd.ms-powerpoint', - 'application/rtf', - 'application/pdf', - 'image/jpeg', - 'image/jpg', - 'image/png', - 'image/gif', - 'image/bmp', - 'image/tiff', - 'image/webp' - ) - ) - ) + WHERE (n.type = 'image' OR (n.type = 'file' AND n.mime IN (${placeholders}))) AND n.isDeleted = 0 AND n.blobId IS NOT NULL AND b.textRepresentation IS NULL - `); + `, supportedMimes); - // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types) const attachmentBlobs = sql.getRows<{ blobId: string; mimeType: string; @@ -666,33 +645,11 @@ class OCRService { SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId FROM attachments a JOIN blobs b ON a.blobId = b.blobId - WHERE ( - a.role = 'image' - OR ( - a.role = 'file' - AND a.mime IN ( - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/msword', - 'application/vnd.ms-excel', - 'application/vnd.ms-powerpoint', - 'application/rtf', - 'application/pdf', - 'image/jpeg', - 'image/jpg', - 'image/png', - 'image/gif', - 'image/bmp', - 'image/tiff', - 'image/webp' - ) - ) - ) + WHERE (a.role = 'image' OR (a.role = 'file' AND a.mime IN (${placeholders}))) AND a.isDeleted = 0 AND a.blobId IS NOT NULL AND b.textRepresentation IS NULL - `); + `, supportedMimes); // Combine results const result = [ diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index bdafb2f45b..b6b0ef6cb0 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,83 +1,52 @@ import * as officeParser from 'officeparser'; import log from '../../log.js'; -import { OCRProcessingOptions,OCRResult } from '../ocr_service.js'; +import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; -import { ImageProcessor } from './image_processor.js'; + +const SUPPORTED_TYPES = [ + // Office Open XML + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX + // OpenDocument + 'application/vnd.oasis.opendocument.text', // ODT + 'application/vnd.oasis.opendocument.spreadsheet', // ODS + 'application/vnd.oasis.opendocument.presentation' // ODP +]; /** - * Office document processor for extracting text and images from DOCX/XLSX/PPTX files + * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files. */ export class OfficeProcessor extends FileProcessor { - private imageProcessor: ImageProcessor; - private readonly supportedTypes = [ - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX - 'application/msword', // DOC - 'application/vnd.ms-excel', // XLS - 'application/vnd.ms-powerpoint', // PPT - 'application/rtf' // RTF - ]; - - constructor() { - super(); - this.imageProcessor = new ImageProcessor(); - } canProcess(mimeType: string): boolean { - return this.supportedTypes.includes(mimeType); + return SUPPORTED_TYPES.includes(mimeType); } getSupportedMimeTypes(): string[] { - return [...this.supportedTypes]; + return [...SUPPORTED_TYPES]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { - try { - log.info('Starting Office document text extraction...'); + log.info('Starting Office document text extraction...'); - const language = options.language || "eng"; + const text = await officeParser.parseOfficeAsync(buffer, { + outputErrorToConsole: false, + newlineDelimiter: '\n', + ignoreNotes: false, + putNotesAtLast: false + }); - // Extract text from Office document - const data = await this.parseOfficeDocument(buffer); + const trimmed = (text || '').trim(); - // Extract text from Office document - const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : ''; - const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction - - const result: OCRResult = { - text: combinedText, - confidence, - extractedAt: new Date().toISOString(), - language, - pageCount: 1 // Office documents are treated as single logical document - }; - - return result; - - } catch (error) { - log.error(`Office document text extraction failed: ${error}`); - throw error; - } - } - - private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> { - try { - // Use promise-based API directly - const data = await officeParser.parseOfficeAsync(buffer, { - outputErrorToConsole: false, - newlineDelimiter: '\n', - ignoreNotes: false, - putNotesAtLast: false - }); - - return { - data: data || '' - }; - } catch (error) { - throw new Error(`Office document parsing failed: ${error}`); - } + return { + text: trimmed, + confidence: trimmed.length > 0 ? 0.99 : 0, + extractedAt: new Date().toISOString(), + language: options.language || "eng", + pageCount: 1 + }; } getProcessingType(): string { @@ -85,6 +54,6 @@ export class OfficeProcessor extends FileProcessor { } async cleanup(): Promise { - await this.imageProcessor.cleanup(); + // Nothing to clean up. } }