fix(ocr): properly handle office MIME types

2026-07-06 01:18:42 +02:00 · 2026-04-02 12:41:45 +03:00
parent ad29375975
commit bdd2b7e317
2 changed files with 38 additions and 112 deletions
--- a/apps/server/src/services/ocr/ocr_service.ts
+++ b/apps/server/src/services/ocr/ocr_service.ts
@@ -620,7 +620,9 @@ class OCRService {
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        try {
-            // Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
+            const supportedMimes = this.getAllSupportedMimeTypes();
+            const placeholders = supportedMimes.map(() => '?').join(', ');
+
            const noteBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@@ -629,35 +631,12 @@ class OCRService {
                SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
-                WHERE (
-                    n.type = 'image'
-                    OR (
-                        n.type = 'file'
-                        AND n.mime IN (
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-                            'application/msword',
-                            'application/vnd.ms-excel',
-                            'application/vnd.ms-powerpoint',
-                            'application/rtf',
-                            'application/pdf',
-                            'image/jpeg',
-                            'image/jpg',
-                            'image/png',
-                            'image/gif',
-                            'image/bmp',
-                            'image/tiff',
-                            'image/webp'
-                        )
-                    )
-                )
+                WHERE (n.type = 'image' OR (n.type = 'file' AND n.mime IN (${placeholders})))
                AND n.isDeleted = 0
                AND n.blobId IS NOT NULL
                AND b.textRepresentation IS NULL
-            `);
+            `, supportedMimes);

-            // Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
            const attachmentBlobs = sql.getRows<{
                blobId: string;
                mimeType: string;
@@ -666,33 +645,11 @@ class OCRService {
                SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
-                WHERE (
-                    a.role = 'image'
-                    OR (
-                        a.role = 'file'
-                        AND a.mime IN (
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-                            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-                            'application/msword',
-                            'application/vnd.ms-excel',
-                            'application/vnd.ms-powerpoint',
-                            'application/rtf',
-                            'application/pdf',
-                            'image/jpeg',
-                            'image/jpg',
-                            'image/png',
-                            'image/gif',
-                            'image/bmp',
-                            'image/tiff',
-                            'image/webp'
-                        )
-                    )
-                )
+                WHERE (a.role = 'image' OR (a.role = 'file' AND a.mime IN (${placeholders})))
                AND a.isDeleted = 0
                AND a.blobId IS NOT NULL
                AND b.textRepresentation IS NULL
-            `);
+            `, supportedMimes);

            // Combine results
            const result = [
--- a/apps/server/src/services/ocr/processors/office_processor.ts
+++ b/apps/server/src/services/ocr/processors/office_processor.ts
@@ -1,83 +1,52 @@
 import * as officeParser from 'officeparser';

 import log from '../../log.js';
-import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
+import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
 import { FileProcessor } from './file_processor.js';
-import { ImageProcessor } from './image_processor.js';
+
+const SUPPORTED_TYPES = [
+    // Office Open XML (OOXML)
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',   // DOCX
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',         // XLSX
+    'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
+    // OpenDocument Format (ODF)
+    'application/vnd.oasis.opendocument.text',                                   // ODT
+    'application/vnd.oasis.opendocument.spreadsheet',                            // ODS
+    'application/vnd.oasis.opendocument.presentation'                            // ODP
+];

 /**
- * Office document processor for extracting text and images from DOCX/XLSX/PPTX files
+ * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files.
 */
 export class OfficeProcessor extends FileProcessor {
-    private imageProcessor: ImageProcessor;
-    private readonly supportedTypes = [
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
-        'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
-        'application/msword', // DOC
-        'application/vnd.ms-excel', // XLS
-        'application/vnd.ms-powerpoint', // PPT
-        'application/rtf' // RTF
-    ];
-
-    constructor() {
-        super();
-        this.imageProcessor = new ImageProcessor();
-    }

    canProcess(mimeType: string): boolean {
-        return this.supportedTypes.includes(mimeType);
+        return SUPPORTED_TYPES.includes(mimeType);
    }

    getSupportedMimeTypes(): string[] {
-        return [...this.supportedTypes];
+        return [...SUPPORTED_TYPES];
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
-        try {
-            log.info('Starting Office document text extraction...');
+        log.info('Starting Office document text extraction...');

-            const language = options.language || "eng";
+        const text = await officeParser.parseOfficeAsync(buffer, {
+            outputErrorToConsole: false,
+            newlineDelimiter: '\n',
+            ignoreNotes: false,
+            putNotesAtLast: false
+        });

-            // Extract text from Office document
-            const data = await this.parseOfficeDocument(buffer);
+        const trimmed = (text || '').trim();

-            // Extract text from Office document
-            const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
-            const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
-
-            const result: OCRResult = {
-                text: combinedText,
-                confidence,
-                extractedAt: new Date().toISOString(),
-                language,
-                pageCount: 1 // Office documents are treated as single logical document
-            };
-
-            return result;
-
-        } catch (error) {
-            log.error(`Office document text extraction failed: ${error}`);
-            throw error;
-        }
-    }
-
-    private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
-        try {
-            // Use promise-based API directly
-            const data = await officeParser.parseOfficeAsync(buffer, {
-                outputErrorToConsole: false,
-                newlineDelimiter: '\n',
-                ignoreNotes: false,
-                putNotesAtLast: false
-            });
-
-            return {
-                data: data || ''
-            };
-        } catch (error) {
-            throw new Error(`Office document parsing failed: ${error}`);
-        }
+        return {
+            text: trimmed,
+            confidence: trimmed.length > 0 ? 0.99 : 0,
+            extractedAt: new Date().toISOString(),
+            language: options.language || "eng",
+            pageCount: 1
+        };
    }

    getProcessingType(): string {
@@ -85,6 +54,6 @@ export class OfficeProcessor extends FileProcessor {
    }

    async cleanup(): Promise<void> {
-        await this.imageProcessor.cleanup();
+        // Nothing to clean up: this processor holds no resources between calls.
    }
 }