// Trilium/apps/server/src/services/ocr/ocr_service.ts

import { getTesseractCode } from '@triliumnext/commons';
import Tesseract from 'tesseract.js';

import becca from '../../becca/becca.js';
import log from '../log.js';
import options from '../options.js';
import sql from '../sql.js';
import { FileProcessor } from './processors/file_processor.js';
import { ImageProcessor } from './processors/image_processor.js';
import { OfficeProcessor } from './processors/office_processor.js';
import { PDFProcessor } from './processors/pdf_processor.js';
import { TIFFProcessor } from './processors/tiff_processor.js';

/**
 * Outcome of a single text-extraction run, as produced by a file processor
 * or reconstructed from the text cached on a blob.
 */
export interface OCRResult {
    /** The extracted text. */
    text: string;
    /**
     * Confidence score for the extraction.
     * NOTE(review): the scale is not established in this file — the service logs
     * it with a `%` suffix but fills in `0.95` for cached results; confirm.
     */
    confidence: number;
    /** Timestamp string describing when the text was extracted (ISO-8601 where this file sets it). */
    extractedAt: string;
    /** Tesseract language code(s) used for the extraction, when known. */
    language?: string;
    /** Presumably the number of pages for multi-page inputs — not set or read in this file; verify against the processors. */
    pageCount?: number;
}

/**
 * Caller-supplied options for an OCR run. The whole object is forwarded to the
 * selected file processor's `extractText`.
 */
export interface OCRProcessingOptions {
    /** Explicit Tesseract language code(s); takes precedence over every other language source. */
    language?: string;
    /** When true, an existing cached OCR result is ignored and the content is processed again. */
    forceReprocess?: boolean;
    /** Not read in this file; presumably a confidence threshold consumed by the processors — TODO confirm. */
    confidence?: number;
    /** Not read in this file; presumably toggles direct text extraction in the PDF processor — TODO confirm. */
    enablePDFTextExtraction?: boolean;
}

/**
 * Row shape used when reading OCR data back from the `blobs` table
 * (see the query in `searchOCRResults`).
 */
interface OCRBlobRow {
    /** Primary key of the blob row. */
    blobId: string;
    /** Text stored for the blob by a previous OCR run. */
    textRepresentation: string;
    /** Timestamp of the last OCR run for the blob, when selected by the query. */
    textExtractionLastProcessed?: string;
}

/** MIME types accepted for image notes and image attachments. */
const SUPPORTED_IMAGE_MIME_TYPES: readonly string[] = [
    'image/jpeg',
    'image/jpg',
    'image/png',
    'image/gif',
    'image/bmp',
    'image/tiff',
    'image/webp'
];

/** MIME types for which `file` notes / `file` attachments are queued for OCR. */
const OCR_FILE_MIME_TYPES: readonly string[] = [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/msword',
    'application/vnd.ms-excel',
    'application/vnd.ms-powerpoint',
    'application/rtf',
    'application/pdf',
    ...SUPPORTED_IMAGE_MIME_TYPES
];

/** A note or attachment whose blob is waiting for OCR. */
type BlobNeedingOCR = { blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string };

/** Progress bookkeeping for one batch run. */
type BatchProcessingState = { inProgress: boolean; total: number; processed: number; startTime?: Date };

/** Everything the shared OCR pipeline needs to know about a note or an attachment. */
interface OCRTarget {
    /** Used in log/error messages. */
    kind: 'note' | 'attachment';
    /** Note ID or attachment ID. */
    id: string;
    /** `note.type` for notes, `attachment.role` for attachments. */
    category: string;
    mime: string;
    blobId: string | undefined;
    /** Note whose `language` label is consulted when resolving the OCR language. */
    languageNoteId: string | undefined;
    /** Message logged when `category` is neither `image` nor `file`. */
    unsupportedCategoryMessage: string;
    getContent: () => unknown;
}

/**
 * OCR Service for extracting text from images and other OCR-able objects.
 *
 * Extraction itself is delegated to the registered {@link FileProcessor}s; this
 * class selects the processor, resolves the language, caches the extracted text
 * on the `blobs` table and drives batch / automatic processing.
 */
class OCRService {
    // Never assigned within this class; kept so that cleanup() can terminate a worker if one is ever attached.
    private worker: Tesseract.Worker | null = null;
    private isProcessing = false;
    /** Number of extractTextFromFile() calls currently in flight. */
    private activeExtractions = 0;
    private processors: Map<string, FileProcessor> = new Map();

    // Batch processing state. Replaced by a fresh object on every startBatchProcessing().
    private batchProcessingState: BatchProcessingState = {
        inProgress: false,
        total: 0,
        processed: 0
    };

    constructor() {
        // Registration order matters: getProcessorForMimeType() returns the first processor that accepts a MIME type.
        this.processors.set('image', new ImageProcessor());
        this.processors.set('pdf', new PDFProcessor());
        this.processors.set('tiff', new TIFFProcessor());
        this.processors.set('office', new OfficeProcessor());
    }

    /**
     * Resolves the Tesseract language code(s) for OCR processing.
     *
     * Priority:
     * 1. Explicitly passed `language` option (e.g. from API call)
     * 2. The note's `language` label (mapped via {@link getTesseractCode})
     * 3. All enabled content languages joined with `+`
     * 4. The UI locale
     * 5. Fallback to `eng`
     *
     * @param noteId note whose `language` label should be consulted (step 2), if any
     * @param explicitLanguage language requested by the caller (step 1), if any
     * @returns a Tesseract language string; never empty
     */
    resolveOcrLanguage(noteId?: string, explicitLanguage?: string): string {
        // 1. Explicit language from caller
        if (explicitLanguage) {
            return explicitLanguage;
        }

        // 2. Note's language label
        if (noteId) {
            const noteLanguage = becca.getNote(noteId)?.getLabelValue("language");
            const code = noteLanguage ? getTesseractCode(noteLanguage) : null;
            if (code) {
                return code;
            }
        }

        // 3. All enabled content languages
        try {
            const parsed: unknown = JSON.parse(options.getOption("languages") || "[]");
            // The option is free-form JSON; only trust it if it really is an array of strings.
            if (Array.isArray(parsed)) {
                const codes = parsed
                    .filter((id): id is string => typeof id === "string")
                    .map((id) => getTesseractCode(id))
                    .filter((code): code is string => Boolean(code));
                // Deduplicate (e.g. en + en-GB both map to eng)
                const unique = [...new Set(codes)];
                if (unique.length > 0) {
                    return unique.join("+");
                }
            }
        } catch {
            // Best effort: a missing or malformed option falls through to the next source.
        }

        // 4. UI locale
        try {
            const uiLocale = options.getOption("locale");
            const code = uiLocale ? getTesseractCode(uiLocale) : null;
            if (code) {
                return code;
            }
        } catch {
            // Best effort: fall through to the default.
        }

        // 5. Fallback
        return "eng";
    }

    /**
     * Check if a MIME type is supported for OCR of image notes / image attachments.
     * The comparison is case-insensitive; non-string or empty input yields `false`.
     */
    isSupportedMimeType(mimeType: string): boolean {
        if (!mimeType || typeof mimeType !== 'string') {
            return false;
        }

        return SUPPORTED_IMAGE_MIME_TYPES.includes(mimeType.toLowerCase());
    }

    /**
     * Extract text from file buffer using appropriate processor.
     *
     * @param fileBuffer raw file content
     * @param mimeType MIME type used to select the processor
     * @param options forwarded unchanged to the processor
     * @throws if no processor accepts `mimeType`, or if the processor itself fails
     */
    async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        // Count in-flight extractions so that overlapping calls do not reset the
        // "processing" flag while another extraction is still running.
        this.activeExtractions++;
        this.isProcessing = true;

        try {
            log.info(`Starting OCR text extraction for MIME type: ${mimeType} with language: ${options.language || "auto-detect"}`);

            // Find appropriate processor
            const processor = this.getProcessorForMimeType(mimeType);
            if (!processor) {
                throw new Error(`No processor found for MIME type: ${mimeType}`);
            }

            const result = await processor.extractText(fileBuffer, options);

            log.info(`OCR extraction completed. Confidence: ${result.confidence}%, Text length: ${result.text.length}`);
            return result;
        } catch (error) {
            log.error(`OCR text extraction failed: ${error}`);
            throw error;
        } finally {
            this.activeExtractions--;
            this.isProcessing = this.activeExtractions > 0;
        }
    }

    /**
     * Process OCR for a note (`image` or `file` type).
     *
     * @returns the OCR result (fresh or cached), or `null` when the note does not
     *          exist or is not eligible for OCR
     * @throws if the content cannot be read, extraction fails or the result cannot be stored
     */
    async processNoteOCR(noteId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
        const note = becca.getNote(noteId);
        if (!note) {
            log.error(`Note ${noteId} not found`);
            return null;
        }

        return this.processEntityOCR({
            kind: 'note',
            id: noteId,
            category: note.type,
            mime: note.mime,
            blobId: note.blobId,
            languageNoteId: noteId,
            unsupportedCategoryMessage: `Note ${noteId} is not an image or file note, skipping OCR`,
            getContent: () => note.getContent()
        }, options);
    }

    /**
     * Process OCR for an attachment (`image` or `file` role). The OCR language is
     * resolved against the attachment's owner.
     *
     * @returns the OCR result (fresh or cached), or `null` when the attachment does
     *          not exist or is not eligible for OCR
     * @throws if the content cannot be read, extraction fails or the result cannot be stored
     */
    async processAttachmentOCR(attachmentId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
        const attachment = becca.getAttachment(attachmentId);
        if (!attachment) {
            log.error(`Attachment ${attachmentId} not found`);
            return null;
        }

        return this.processEntityOCR({
            kind: 'attachment',
            id: attachmentId,
            category: attachment.role,
            mime: attachment.mime,
            blobId: attachment.blobId,
            languageNoteId: attachment.ownerId,
            unsupportedCategoryMessage: `Attachment ${attachmentId} is not an image or file, skipping OCR`,
            getContent: () => attachment.getContent()
        }, options);
    }

    /**
     * Shared pipeline behind processNoteOCR() / processAttachmentOCR():
     * eligibility check, cache lookup, extraction, storage.
     */
    private async processEntityOCR(target: OCRTarget, options: OCRProcessingOptions): Promise<OCRResult | null> {
        const { kind, id, mime, blobId } = target;

        // Check if category and MIME type are supported for OCR
        if (target.category === 'image') {
            if (!this.isSupportedMimeType(mime)) {
                log.info(`Image ${kind} ${id} has unsupported MIME type ${mime}, skipping OCR`);
                return null;
            }
        } else if (target.category === 'file') {
            // Check if file MIME type is supported by any processor
            if (!this.getProcessorForMimeType(mime)) {
                log.info(`File ${kind} ${id} has unsupported MIME type ${mime} for OCR, skipping`);
                return null;
            }
        } else {
            log.info(target.unsupportedCategoryMessage);
            return null;
        }

        // Reuse the cached text unless the caller forces a re-run or the blob changed since the last run.
        if (!options.forceReprocess && blobId) {
            const existingOCR = this.getStoredOCRResult(blobId);
            if (existingOCR && !this.needsReprocessing(blobId)) {
                log.info(`OCR already exists and is up-to-date for ${kind} ${id}, returning cached result`);
                return existingOCR;
            }
        }

        try {
            const content = target.getContent();
            if (!(content instanceof Buffer)) {
                throw new Error(`Cannot get image content for ${kind} ${id}`);
            }

            const language = this.resolveOcrLanguage(target.languageNoteId, options.language);
            const ocrResult = await this.extractTextFromFile(content, mime, { ...options, language });

            // Store OCR result in blob
            await this.storeOCRResult(blobId, ocrResult);

            return ocrResult;
        } catch (error) {
            log.error(`Failed to process OCR for ${kind} ${id}: ${error}`);
            throw error;
        }
    }

    /**
     * Store OCR result in blob: writes the text and the current time
     * (ISO-8601) to the `blobs` row. Only `ocrResult.text` is persisted.
     *
     * @throws if the database update fails; a missing `blobId` is only logged
     */
    async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
        if (!blobId) {
            log.error('Cannot store OCR result: blobId is undefined');
            return;
        }

        try {
            // Store OCR text and timestamp in blobs table
            sql.execute(`
                UPDATE blobs SET
                    textRepresentation = ?,
                    textExtractionLastProcessed = ?
                WHERE blobId = ?
            `, [
                ocrResult.text,
                new Date().toISOString(),
                blobId
            ]);

            log.info(`Stored OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }

    /**
     * Get stored OCR result from blob.
     *
     * @returns `null` when there is no blob ID, no row, no (or empty) stored text,
     *          or when the lookup fails
     */
    private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
        if (!blobId) {
            return null;
        }

        try {
            const row = sql.getRow<{
                textRepresentation: string | null;
                textExtractionLastProcessed: string | null;
            }>(`
                SELECT textRepresentation, textExtractionLastProcessed
                FROM blobs
                WHERE blobId = ?
            `, [blobId]);

            if (!row || !row.textRepresentation) {
                return null;
            }

            // Only the text and the processing timestamp are persisted; confidence
            // and language are placeholders for cached results.
            return {
                text: row.textRepresentation,
                confidence: 0.95, // Default high confidence for existing OCR
                // Report when the text was actually extracted; fall back to "now" for rows without a timestamp.
                extractedAt: row.textExtractionLastProcessed || new Date().toISOString(),
                language: 'eng'
            };
        } catch (error) {
            log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
            return null;
        }
    }

    /**
     * Search for text in OCR results (substring match on the stored text).
     *
     * `%`, `_` and `\` in `searchText` are matched literally rather than being
     * interpreted as LIKE wildcards.
     *
     * @returns matching blobs; an empty array if the query fails
     */
    searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
        try {
            // Escape LIKE metacharacters so user input cannot widen the match.
            const escapedText = searchText.replace(/[\\%_]/g, "\\$&");
            const query = `
                SELECT blobId, textRepresentation
                FROM blobs
                WHERE textRepresentation LIKE ? ESCAPE '\\'
                AND textRepresentation IS NOT NULL
            `;
            const params = [`%${escapedText}%`];

            const rows = sql.getRows<OCRBlobRow>(query, params);

            return rows.map(row => ({
                blobId: row.blobId,
                text: row.textRepresentation
            }));
        } catch (error) {
            log.error(`Failed to search OCR results: ${error}`);
            return [];
        }
    }

    /**
     * Delete OCR results for a blob. Only the stored text is cleared; the
     * last-processed timestamp is kept (use invalidateOCRResult() to clear both).
     *
     * @throws if the database update fails
     */
    deleteOCRResult(blobId: string): void {
        try {
            sql.execute(`
                UPDATE blobs SET textRepresentation = NULL
                WHERE blobId = ?
            `, [blobId]);

            log.info(`Deleted OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }

    /**
     * Process OCR for all files that don't have OCR results yet or need reprocessing.
     * Alias of processAllBlobsNeedingOCR().
     */
    async processAllImages(): Promise<void> {
        return this.processAllBlobsNeedingOCR();
    }

    /**
     * Get OCR statistics: number of blobs with non-empty stored text, and how many
     * non-deleted image notes / image attachments point at such blobs.
     * All counters are 0 if a query fails.
     */
    getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
        try {
            const stats = sql.getRow<{
                total_processed: number;
            }>(`
                SELECT COUNT(*) as total_processed
                FROM blobs
                WHERE textRepresentation IS NOT NULL AND textRepresentation != ''
            `);

            // Count image notes with OCR
            const noteStats = sql.getRow<{
                count: number;
            }>(`
                SELECT COUNT(*) as count
                FROM notes n
                JOIN blobs b ON n.blobId = b.blobId
                WHERE n.type = 'image'
                AND n.isDeleted = 0
                AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
            `);

            // Count image attachments with OCR
            const attachmentStats = sql.getRow<{
                count: number;
            }>(`
                SELECT COUNT(*) as count
                FROM attachments a
                JOIN blobs b ON a.blobId = b.blobId
                WHERE a.role = 'image'
                AND a.isDeleted = 0
                AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
            `);

            return {
                totalProcessed: stats?.total_processed || 0,
                imageNotes: noteStats?.count || 0,
                imageAttachments: attachmentStats?.count || 0
            };
        } catch (error) {
            log.error(`Failed to get OCR stats: ${error}`);
            return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
        }
    }

    /**
     * Clean up OCR service: terminates the Tesseract worker if one is attached.
     */
    async cleanup(): Promise<void> {
        if (this.worker) {
            await this.worker.terminate();
            this.worker = null;
        }
        log.info('OCR service cleaned up');
    }

    /**
     * Check if currently processing, i.e. at least one extractTextFromFile() call is in flight.
     */
    isCurrentlyProcessing(): boolean {
        return this.isProcessing;
    }

    /**
     * Start batch OCR processing with progress tracking. The work runs in the
     * background; poll getBatchProgress() for status.
     *
     * @returns `success: false` with a message when a batch is already running,
     *          nothing needs processing, or start-up fails
     */
    async startBatchProcessing(): Promise<{ success: boolean; message?: string }> {
        if (this.batchProcessingState.inProgress) {
            return { success: false, message: 'Batch processing already in progress' };
        }

        try {
            // Count total blobs needing OCR processing
            const blobsNeedingOCR = this.getBlobsNeedingOCR();
            const totalCount = blobsNeedingOCR.length;

            if (totalCount === 0) {
                return { success: false, message: 'No images found that need OCR processing' };
            }

            // Initialize batch processing state. A fresh object per run lets an
            // earlier, cancelled run tell that it is no longer the current one.
            const state: BatchProcessingState = {
                inProgress: true,
                total: totalCount,
                processed: 0,
                startTime: new Date()
            };
            this.batchProcessingState = state;

            // Start processing in background
            this.processBatchInBackground(blobsNeedingOCR).catch(error => {
                log.error(`Batch processing failed: ${error instanceof Error ? error.message : String(error)}`);
                state.inProgress = false;
            });

            return { success: true };
        } catch (error) {
            log.error(`Failed to start batch processing: ${error instanceof Error ? error.message : String(error)}`);
            return { success: false, message: error instanceof Error ? error.message : String(error) };
        }
    }

    /**
     * Get batch processing progress. `percentage` is only present when `total > 0`.
     */
    getBatchProgress(): { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } {
        const result: { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } = { ...this.batchProcessingState };
        if (result.total > 0) {
            result.percentage = (result.processed / result.total) * 100;
        }
        return result;
    }

    /**
     * Process batch OCR in background with progress tracking.
     *
     * The run is bound to the state object that is current when it starts, so a
     * cancelled run that is still awaiting an item cannot be revived by (or write
     * progress into) a batch started afterwards.
     */
    private async processBatchInBackground(blobsToProcess: BlobNeedingOCR[]): Promise<void> {
        const state = this.batchProcessingState;

        try {
            log.info('Starting batch OCR processing...');

            for (const blobInfo of blobsToProcess) {
                if (!state.inProgress) {
                    break; // Stop if processing was cancelled
                }

                try {
                    await this.processBlob(blobInfo);
                    state.processed++;
                    // Add small delay to prevent overwhelming the system
                    await new Promise(resolve => setTimeout(resolve, 500));
                } catch (error) {
                    log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                    state.processed++; // Count as processed even if failed
                }
            }

            // Mark as completed
            state.inProgress = false;
            log.info(`Batch OCR processing completed. Processed ${state.processed} files.`);
        } catch (error) {
            log.error(`Batch OCR processing failed: ${error}`);
            state.inProgress = false;
            throw error;
        }
    }

    /**
     * Cancel batch processing. The running batch stops before its next item.
     */
    cancelBatchProcessing(): void {
        if (this.batchProcessingState.inProgress) {
            this.batchProcessingState.inProgress = false;
            log.info('Batch OCR processing cancelled');
        }
    }

    /** Runs OCR for one queued entry through the note or attachment entry point. */
    private processBlob(blobInfo: BlobNeedingOCR): Promise<OCRResult | null> {
        return blobInfo.entityType === 'note'
            ? this.processNoteOCR(blobInfo.entityId)
            : this.processAttachmentOCR(blobInfo.entityId);
    }

    /**
     * Get processor for a given MIME type: the first registered processor whose
     * `canProcess` accepts it, or `null`.
     */
    private getProcessorForMimeType(mimeType: string): FileProcessor | null {
        for (const processor of this.processors.values()) {
            if (processor.canProcess(mimeType)) {
                return processor;
            }
        }
        return null;
    }

    /**
     * Get all MIME types supported by all registered processors (deduplicated).
     */
    getAllSupportedMimeTypes(): string[] {
        const supportedTypes = new Set<string>();

        // Gather MIME types from all registered processors
        for (const processor of this.processors.values()) {
            for (const type of processor.getSupportedMimeTypes()) {
                supportedTypes.add(type);
            }
        }

        return Array.from(supportedTypes);
    }

    /**
     * Check if a MIME type is supported by any processor.
     */
    isSupportedByAnyProcessor(mimeType: string): boolean {
        if (!mimeType) {
            return false;
        }

        return this.getProcessorForMimeType(mimeType) !== null;
    }

    /**
     * Check if blob needs OCR re-processing due to content changes.
     *
     * @returns `true` when the blob was never processed or was modified after the
     *          last OCR run; `false` for an empty ID, an unknown blob, or when the
     *          lookup fails
     */
    needsReprocessing(blobId: string): boolean {
        if (!blobId) {
            return false;
        }

        try {
            const blobInfo = sql.getRow<{
                utcDateModified: string;
                textExtractionLastProcessed: string | null;
            }>(`
                SELECT utcDateModified, textExtractionLastProcessed
                FROM blobs
                WHERE blobId = ?
            `, [blobId]);

            if (!blobInfo) {
                return false;
            }

            // If OCR was never processed, it needs processing
            if (!blobInfo.textExtractionLastProcessed) {
                return true;
            }

            // If blob was modified after last OCR processing, it needs re-processing
            const blobModified = new Date(blobInfo.utcDateModified);
            const lastOcrProcessed = new Date(blobInfo.textExtractionLastProcessed);

            return blobModified > lastOcrProcessed;
        } catch (error) {
            log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
            return false;
        }
    }

    /**
     * Invalidate OCR results for a blob (clear textRepresentation and textExtractionLastProcessed).
     *
     * @throws if the database update fails; an empty ID is ignored
     */
    invalidateOCRResult(blobId: string): void {
        if (!blobId) {
            return;
        }

        try {
            sql.execute(`
                UPDATE blobs SET
                    textRepresentation = NULL,
                    textExtractionLastProcessed = NULL
                WHERE blobId = ?
            `, [blobId]);

            log.info(`Invalidated OCR result for blob ${blobId}`);
        } catch (error) {
            log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
            throw error;
        }
    }

    /**
     * Get blobs that need OCR processing (modified after last OCR or never processed).
     *
     * Covers non-deleted image notes/attachments of any MIME type and file
     * notes/attachments whose MIME type is in the OCR file list.
     *
     * @returns notes first, then attachments; an empty array if a query fails
     */
    getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
        try {
            const noteBlobs = this.queryBlobsNeedingOCR('notes', 'type', 'noteId');
            const attachmentBlobs = this.queryBlobsNeedingOCR('attachments', 'role', 'attachmentId');

            // Combine results
            return [
                ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
                ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
            ];
        } catch (error) {
            log.error(`Failed to get blobs needing OCR: ${error}`);
            return [];
        }
    }

    /**
     * Selects the entities of one table whose blob was never processed or was
     * modified after the last OCR run.
     *
     * The interpolated identifiers come from the fixed literal unions in the
     * signature, never from user input; MIME types are bound as parameters.
     */
    private queryBlobsNeedingOCR(
        table: 'notes' | 'attachments',
        categoryColumn: 'type' | 'role',
        idColumn: 'noteId' | 'attachmentId'
    ): Array<{ blobId: string; mimeType: string; entityId: string }> {
        const placeholders = OCR_FILE_MIME_TYPES.map(() => '?').join(', ');

        // Timestamps are compared via julianday() rather than as raw strings, in line
        // with the Date-based comparison in needsReprocessing(). storeOCRResult() writes
        // ISO-8601 with a 'T' separator; if utcDateModified uses another separator
        // (presumably a space — confirm against the blob writer), a plain string
        // comparison would mis-order values from the same day.
        return sql.getRows<{
            blobId: string;
            mimeType: string;
            entityId: string;
        }>(`
            SELECT e.blobId, e.mime as mimeType, e.${idColumn} as entityId
            FROM ${table} e
            JOIN blobs b ON e.blobId = b.blobId
            WHERE (
                e.${categoryColumn} = 'image'
                OR (
                    e.${categoryColumn} = 'file'
                    AND e.mime IN (${placeholders})
                )
            )
            AND e.isDeleted = 0
            AND e.blobId IS NOT NULL
            AND (
                b.textExtractionLastProcessed IS NULL
                OR julianday(b.utcDateModified) > julianday(b.textExtractionLastProcessed)
            )
        `, [...OCR_FILE_MIME_TYPES]);
    }

    /**
     * Process OCR for all blobs that need it (auto-processing).
     * Does nothing unless the `ocrAutoProcessImages` option is enabled. Failures of
     * individual blobs are logged and do not stop the run.
     */
    async processAllBlobsNeedingOCR(): Promise<void> {
        if (!options.getOptionBool('ocrAutoProcessImages')) {
            log.info('OCR auto-processing is disabled, skipping');
            return;
        }

        const blobsNeedingOCR = this.getBlobsNeedingOCR();
        if (blobsNeedingOCR.length === 0) {
            log.info('No blobs need OCR processing');
            return;
        }

        log.info(`Auto-processing OCR for ${blobsNeedingOCR.length} blobs...`);

        for (const blobInfo of blobsNeedingOCR) {
            try {
                await this.processBlob(blobInfo);

                // Add small delay to prevent overwhelming the system
                await new Promise(resolve => setTimeout(resolve, 100));
            } catch (error) {
                log.error(`Failed to auto-process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                // Continue with other blobs
            }
        }

        log.info('Auto-processing OCR completed');
    }
}

// Single shared service instance, created when this module is first loaded.
export default new OCRService();