mirror of
https://github.com/zadam/trilium.git
synced 2026-06-27 16:39:20 +02:00
794 lines
28 KiB
TypeScript
794 lines
28 KiB
TypeScript
import { getTesseractCode } from '@triliumnext/commons';
|
|
import Tesseract from 'tesseract.js';
|
|
|
|
import becca from '../../becca/becca.js';
|
|
import log from '../log.js';
|
|
import options from '../options.js';
|
|
import sql from '../sql.js';
|
|
import { FileProcessor } from './processors/file_processor.js';
|
|
import { ImageProcessor } from './processors/image_processor.js';
|
|
import { OfficeProcessor } from './processors/office_processor.js';
|
|
import { PDFProcessor } from './processors/pdf_processor.js';
|
|
import { TIFFProcessor } from './processors/tiff_processor.js';
|
|
|
|
/** Outcome of a single text-extraction run. */
export interface OCRResult {
    /** The extracted text. */
    text: string;
    /**
     * Confidence score for the extraction.
     * NOTE(review): the scale is not fixed in this file — a log message prints it with a `%`
     * suffix while cached results use `0.95`; confirm with the processors.
     */
    confidence: number;
    /** Timestamp string for the extraction (this file produces it with `Date#toISOString()`). */
    extractedAt: string;
    /** Language code(s) associated with the extraction, when known. */
    language?: string;
    /** Page count, when a processor reports one. */
    pageCount?: number;
}
|
|
|
|
/** Caller-supplied options for an OCR run; every field is optional. */
export interface OCRProcessingOptions {
    /** Explicit language code(s); when set it overrides every other language source. */
    language?: string;
    /** When true, a stored result is ignored and the content is processed again. */
    forceReprocess?: boolean;
    /** Not read in this file; forwarded to the processor — presumably a confidence threshold (TODO confirm). */
    confidence?: number;
    /** Not read in this file; forwarded to the processor — presumably toggles direct PDF text extraction (TODO confirm). */
    enablePDFTextExtraction?: boolean;
}
|
|
|
|
/** Shape of a row read from the `blobs` table by the OCR search query. */
interface OCRBlobRow {
    /** Identifier of the blob. */
    blobId: string;
    /** Text stored for the blob by a previous extraction. */
    textRepresentation: string;
    /** Timestamp of the last extraction; only present when the query selects it. */
    textExtractionLastProcessed?: string;
}
|
|
|
|
/**
|
|
* OCR Service for extracting text from images and other OCR-able objects
|
|
* Uses Tesseract.js for text recognition
|
|
*/
|
|
class OCRService {
|
|
/** Tesseract worker handle. Nothing in the visible code assigns it; cleanup() terminates it if set. */
private worker: Tesseract.Worker | null = null;

/** True while extractTextFromFile() is running. */
private isProcessing = false;

/** Registered file processors, keyed by a short kind name. */
private processors: Map<string, FileProcessor> = new Map();

/**
 * Registers the built-in processors.
 *
 * Registration order is significant: processor lookup walks the map in insertion
 * order and returns the first processor that accepts a MIME type.
 */
constructor() {
    const builtIns: Array<[string, FileProcessor]> = [
        ['image', new ImageProcessor()],
        ['pdf', new PDFProcessor()],
        ['tiff', new TIFFProcessor()],
        ['office', new OfficeProcessor()]
    ];

    for (const [kind, processor] of builtIns) {
        this.processors.set(kind, processor);
    }
}
|
|
|
|
/**
 * Resolves the Tesseract language code(s) for OCR processing.
 *
 * Priority:
 * 1. Explicitly passed `language` option (e.g. from API call)
 * 2. The note's `language` label (mapped via {@link getTesseractCode})
 * 3. All enabled content languages joined with `+`
 * 4. The UI locale
 * 5. Fallback to `eng`
 *
 * Steps 3 and 4 are best-effort: any error while reading or parsing the options
 * is swallowed on purpose so that resolution falls through to the next step.
 *
 * @param noteId - note whose `language` label should be consulted (step 2); optional.
 * @param explicitLanguage - caller-provided language, returned unchanged when non-empty.
 * @returns a non-empty Tesseract language string.
 */
resolveOcrLanguage(noteId?: string, explicitLanguage?: string): string {
    // 1. Explicit language from caller
    if (explicitLanguage) {
        return explicitLanguage;
    }

    // 2. Note's language label
    if (noteId) {
        const noteLanguage = becca.getNote(noteId)?.getLabelValue("language");
        const noteCode = noteLanguage ? getTesseractCode(noteLanguage) : null;
        if (noteCode) {
            return noteCode;
        }
    }

    // 3. All enabled content languages
    try {
        const parsed: unknown = JSON.parse(options.getOption("languages") || "[]");

        // The option is external data: only trust it if it really is an array of strings.
        if (Array.isArray(parsed)) {
            const codes = parsed
                .filter((id): id is string => typeof id === "string")
                .map((id) => getTesseractCode(id))
                // Drop every "no mapping" result (null, undefined or empty), like the
                // truthiness checks in steps 2 and 4, so it can never end up inside the joined string.
                .filter((code): code is string => typeof code === "string" && code !== "");

            // Deduplicate (e.g. en + en-GB both map to eng)
            const unique = [...new Set(codes)];
            if (unique.length > 0) {
                return unique.join("+");
            }
        }
    } catch {
        // Fall through
    }

    // 4. UI locale
    try {
        const uiLocale = options.getOption("locale");
        const localeCode = uiLocale ? getTesseractCode(uiLocale) : null;
        if (localeCode) {
            return localeCode;
        }
    } catch {
        // Fall through
    }

    // 5. Fallback
    return "eng";
}
|
|
|
|
/**
 * Tells whether an image MIME type can be OCR-processed.
 *
 * The comparison is case-insensitive. Empty or non-string input yields `false`.
 */
isSupportedMimeType(mimeType: string): boolean {
    if (typeof mimeType !== 'string' || mimeType === '') {
        return false;
    }

    switch (mimeType.toLowerCase()) {
        case 'image/jpeg':
        case 'image/jpg':
        case 'image/png':
        case 'image/gif':
        case 'image/bmp':
        case 'image/tiff':
        case 'image/webp':
            return true;
        default:
            return false;
    }
}
|
|
|
|
/**
 * Runs text extraction on a file buffer with the first processor that accepts the MIME type.
 *
 * The `isProcessing` flag is raised for the duration of the call and always cleared afterwards.
 * NOTE(review): the flag is a single boolean, so with overlapping calls the first one to finish
 * clears it while the others may still be running.
 *
 * @param fileBuffer - raw file content.
 * @param mimeType - MIME type used to pick the processor.
 * @param options - processing options handed to the processor unchanged.
 * @returns the processor's result.
 * @throws when no processor accepts the MIME type, or when the processor itself fails
 *         (the error is logged and re-thrown).
 */
async extractTextFromFile(fileBuffer: Buffer, mimeType: string, options: OCRProcessingOptions = {}): Promise<OCRResult> {
    try {
        log.info(`Starting OCR text extraction for MIME type: ${mimeType} with language: ${options.language || "auto-detect"}`);
        this.isProcessing = true;

        const handler = this.getProcessorForMimeType(mimeType);
        if (handler === null) {
            throw new Error(`No processor found for MIME type: ${mimeType}`);
        }

        const extraction = await handler.extractText(fileBuffer, options);
        log.info(`OCR extraction completed. Confidence: ${extraction.confidence}%, Text length: ${extraction.text.length}`);

        return extraction;
    } catch (error) {
        log.error(`OCR text extraction failed: ${error}`);
        throw error;
    } finally {
        this.isProcessing = false;
    }
}
|
|
|
|
/**
 * Runs OCR for an image or file note and stores the result on the note's blob.
 *
 * @param noteId - ID of the note to process.
 * @param options - processing options; `forceReprocess` bypasses the stored result.
 * @returns the OCR result (fresh or stored), or `null` when the note does not exist,
 *          is neither an image nor a file note, or has an unsupported MIME type.
 * @throws when the note content is not a Buffer, or when extraction or storing fails
 *         (the error is logged and re-thrown).
 */
async processNoteOCR(noteId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
    const note = becca.getNote(noteId);
    if (!note) {
        log.error(`Note ${noteId} not found`);
        return null;
    }

    // Only image notes and file notes with a matching processor are eligible.
    switch (note.type) {
        case 'image':
            if (!this.isSupportedMimeType(note.mime)) {
                log.info(`Image note ${noteId} has unsupported MIME type ${note.mime}, skipping OCR`);
                return null;
            }
            break;
        case 'file':
            if (!this.getProcessorForMimeType(note.mime)) {
                log.info(`File note ${noteId} has unsupported MIME type ${note.mime} for OCR, skipping`);
                return null;
            }
            break;
        default:
            log.info(`Note ${noteId} is not an image or file note, skipping OCR`);
            return null;
    }

    // Reuse the stored result unless the caller forces a re-run or the blob is stale.
    const cached = this.getStoredOCRResult(note.blobId);
    if (cached && !options.forceReprocess && note.blobId) {
        if (!this.needsReprocessing(note.blobId)) {
            log.info(`OCR already exists and is up-to-date for note ${noteId}, returning cached result`);
            return cached;
        }
    }

    try {
        const content = note.getContent();
        if (!content || !(content instanceof Buffer)) {
            throw new Error(`Cannot get image content for note ${noteId}`);
        }

        const language = this.resolveOcrLanguage(noteId, options.language);
        const freshResult = await this.extractTextFromFile(content, note.mime, { ...options, language });

        // Persist the text on the blob so later calls can reuse it.
        await this.storeOCRResult(note.blobId, freshResult);

        return freshResult;
    } catch (error) {
        log.error(`Failed to process OCR for note ${noteId}: ${error}`);
        throw error;
    }
}
|
|
|
|
/**
 * Runs OCR for an image or file attachment and stores the result on the attachment's blob.
 *
 * The OCR language is resolved against the attachment's owner (`attachment.ownerId`).
 *
 * @param attachmentId - ID of the attachment to process.
 * @param options - processing options; `forceReprocess` bypasses the stored result.
 * @returns the OCR result (fresh or stored), or `null` when the attachment does not exist,
 *          has a role other than `image`/`file`, or has an unsupported MIME type.
 * @throws when the attachment content is not a Buffer, or when extraction or storing fails
 *         (the error is logged and re-thrown).
 */
async processAttachmentOCR(attachmentId: string, options: OCRProcessingOptions = {}): Promise<OCRResult | null> {
    const attachment = becca.getAttachment(attachmentId);
    if (!attachment) {
        log.error(`Attachment ${attachmentId} not found`);
        return null;
    }

    // Only image attachments and file attachments with a matching processor are eligible.
    switch (attachment.role) {
        case 'image':
            if (!this.isSupportedMimeType(attachment.mime)) {
                log.info(`Image attachment ${attachmentId} has unsupported MIME type ${attachment.mime}, skipping OCR`);
                return null;
            }
            break;
        case 'file':
            if (!this.getProcessorForMimeType(attachment.mime)) {
                log.info(`File attachment ${attachmentId} has unsupported MIME type ${attachment.mime} for OCR, skipping`);
                return null;
            }
            break;
        default:
            log.info(`Attachment ${attachmentId} is not an image or file, skipping OCR`);
            return null;
    }

    // Reuse the stored result unless the caller forces a re-run or the blob is stale.
    const cached = this.getStoredOCRResult(attachment.blobId);
    if (cached && !options.forceReprocess && attachment.blobId) {
        if (!this.needsReprocessing(attachment.blobId)) {
            log.info(`OCR already exists and is up-to-date for attachment ${attachmentId}, returning cached result`);
            return cached;
        }
    }

    try {
        const content = attachment.getContent();
        if (!content || !(content instanceof Buffer)) {
            throw new Error(`Cannot get image content for attachment ${attachmentId}`);
        }

        const language = this.resolveOcrLanguage(attachment.ownerId, options.language);
        const freshResult = await this.extractTextFromFile(content, attachment.mime, { ...options, language });

        // Persist the text on the blob so later calls can reuse it.
        await this.storeOCRResult(attachment.blobId, freshResult);

        return freshResult;
    } catch (error) {
        log.error(`Failed to process OCR for attachment ${attachmentId}: ${error}`);
        throw error;
    }
}
|
|
|
|
/**
 * Persists an OCR result on a blob row.
 *
 * Only the text is stored, together with the current time as the "last processed"
 * timestamp; confidence and language are not persisted.
 *
 * @param blobId - target blob; when missing, an error is logged and nothing is written.
 * @param ocrResult - result whose `text` is stored.
 * @throws re-throws any error raised by the SQL layer after logging it.
 */
async storeOCRResult(blobId: string | undefined, ocrResult: OCRResult): Promise<void> {
    if (!blobId) {
        log.error('Cannot store OCR result: blobId is undefined');
        return;
    }

    try {
        const updateParams = [ocrResult.text, new Date().toISOString(), blobId];

        sql.execute(`
            UPDATE blobs SET
                textRepresentation = ?,
                textExtractionLastProcessed = ?
            WHERE blobId = ?
        `, updateParams);

        log.info(`Stored OCR result for blob ${blobId}`);
    } catch (error) {
        log.error(`Failed to store OCR result for blob ${blobId}: ${error}`);
        throw error;
    }
}
|
|
|
|
/**
 * Reads the OCR text previously stored on a blob.
 *
 * Only the text and the processing timestamp are persisted, so `confidence` and
 * `language` in the returned object are fixed placeholder values, not measurements.
 * `extractedAt` is the stored `textExtractionLastProcessed` timestamp. The current
 * time is used only when the row has text but no timestamp.
 *
 * @param blobId - blob to look up; a missing ID yields `null`.
 * @returns the stored result, or `null` when the blob is unknown, has no (or empty)
 *          stored text, or the lookup fails (the failure is logged, not thrown).
 */
private getStoredOCRResult(blobId: string | undefined): OCRResult | null {
    if (!blobId) {
        return null;
    }

    try {
        const row = sql.getRow<{
            textRepresentation: string | null;
            textExtractionLastProcessed: string | null;
        }>(`
            SELECT textRepresentation, textExtractionLastProcessed
            FROM blobs
            WHERE blobId = ?
        `, [blobId]);

        if (!row || !row.textRepresentation) {
            return null;
        }

        return {
            text: row.textRepresentation,
            confidence: 0.95, // Placeholder: the real confidence is not persisted
            // Report when the text was actually extracted instead of "now".
            extractedAt: row.textExtractionLastProcessed || new Date().toISOString(),
            language: 'eng' // Placeholder: the language used is not persisted
        };
    } catch (error) {
        log.error(`Failed to get OCR result for blob ${blobId}: ${error}`);
        return null;
    }
}
|
|
|
|
/**
 * Finds blobs whose stored OCR text contains the given text.
 *
 * The search text is matched literally: `%`, `_` and `\` are escaped so they are not
 * interpreted as LIKE wildcards (searching for `100%` only matches text that really
 * contains `100%`). Matching otherwise follows SQLite's `LIKE` semantics.
 *
 * @param searchText - substring to look for.
 * @returns matching blobs with their stored text; an empty array when the query fails
 *          (the failure is logged, not thrown).
 */
searchOCRResults(searchText: string): Array<{ blobId: string; text: string }> {
    try {
        // Prefix each LIKE metacharacter (and the escape character itself) with a backslash.
        const literalPattern = searchText.replace(/[\\%_]/g, '\\$&');

        const rows = sql.getRows<OCRBlobRow>(`
            SELECT blobId, textRepresentation
            FROM blobs
            WHERE textRepresentation IS NOT NULL
              AND textRepresentation LIKE ? ESCAPE '\\'
        `, [`%${literalPattern}%`]);

        return rows.map(row => ({
            blobId: row.blobId,
            text: row.textRepresentation
        }));
    } catch (error) {
        log.error(`Failed to search OCR results: ${error}`);
        return [];
    }
}
|
|
|
|
/**
 * Removes the stored OCR text of a blob.
 *
 * Only `textRepresentation` is cleared; the `textExtractionLastProcessed` timestamp
 * is left untouched by this statement.
 *
 * @param blobId - blob whose stored text is removed.
 * @throws re-throws any error raised by the SQL layer after logging it.
 */
deleteOCRResult(blobId: string): void {
    try {
        const clearTextSql = `
            UPDATE blobs SET textRepresentation = NULL
            WHERE blobId = ?
        `;
        sql.execute(clearTextSql, [blobId]);

        log.info(`Deleted OCR result for blob ${blobId}`);
    } catch (error) {
        log.error(`Failed to delete OCR result for blob ${blobId}: ${error}`);
        throw error;
    }
}
|
|
|
|
/**
 * Alias for {@link processAllBlobsNeedingOCR}: processes every note/attachment blob
 * that has no OCR result yet or whose result is out of date.
 */
async processAllImages(): Promise<void> {
    await this.processAllBlobsNeedingOCR();
}
|
|
|
|
/**
 * Collects counters about stored OCR text.
 *
 * @returns `totalProcessed` — blobs with non-empty stored text;
 *          `imageNotes` — non-deleted image notes whose blob has non-empty stored text;
 *          `imageAttachments` — non-deleted image attachments whose blob has non-empty stored text.
 *          All counters are 0 when any query fails (the failure is logged, not thrown).
 */
getOCRStats(): { totalProcessed: number; imageNotes: number; imageAttachments: number } {
    try {
        // Blobs that carry any OCR text at all
        const blobTotals = sql.getRow<{
            total_processed: number;
        }>(`
            SELECT COUNT(*) as total_processed
            FROM blobs
            WHERE textRepresentation IS NOT NULL AND textRepresentation != ''
        `);

        // Image notes backed by such a blob
        const noteTotals = sql.getRow<{
            count: number;
        }>(`
            SELECT COUNT(*) as count
            FROM notes n
            JOIN blobs b ON n.blobId = b.blobId
            WHERE n.type = 'image'
            AND n.isDeleted = 0
            AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
        `);

        // Image attachments backed by such a blob
        const attachmentTotals = sql.getRow<{
            count: number;
        }>(`
            SELECT COUNT(*) as count
            FROM attachments a
            JOIN blobs b ON a.blobId = b.blobId
            WHERE a.role = 'image'
            AND a.isDeleted = 0
            AND b.textRepresentation IS NOT NULL AND b.textRepresentation != ''
        `);

        const totalProcessed = blobTotals?.total_processed || 0;
        const imageNotes = noteTotals?.count || 0;
        const imageAttachments = attachmentTotals?.count || 0;

        return { totalProcessed, imageNotes, imageAttachments };
    } catch (error) {
        log.error(`Failed to get OCR stats: ${error}`);
        return { totalProcessed: 0, imageNotes: 0, imageAttachments: 0 };
    }
}
|
|
|
|
/**
 * Releases resources held by the service: terminates the Tesseract worker, if one
 * is set, and drops the reference to it.
 */
async cleanup(): Promise<void> {
    const activeWorker = this.worker;
    if (activeWorker !== null) {
        await activeWorker.terminate();
        this.worker = null;
    }

    log.info('OCR service cleaned up');
}
|
|
|
|
/**
 * Reports whether a text extraction is running right now.
 *
 * @returns the current value of the `isProcessing` flag.
 */
isCurrentlyProcessing(): boolean {
    const busy = this.isProcessing;
    return busy;
}
|
|
|
|
/**
 * Progress bookkeeping for the manually started batch run.
 * Starts out idle; startBatchProcessing() replaces it with a fresh object for each run.
 */
private batchProcessingState: {
    /** True while a batch run is active; setting it to false asks the run to stop. */
    inProgress: boolean;
    /** Number of blobs selected for the run. */
    total: number;
    /** Number of blobs handled so far, failures included. */
    processed: number;
    /** When the run was started; absent before the first run. */
    startTime?: Date;
} = {
    inProgress: false,
    total: 0,
    processed: 0
};
|
|
|
|
/**
 * Kicks off a batch OCR run over every blob that currently needs processing.
 *
 * The run itself continues in the background; this method resolves as soon as it
 * has been started. Progress can be polled with getBatchProgress().
 *
 * @returns `{ success: true }` when a run was started; otherwise `success: false`
 *          with a message (a run is already active, nothing needs processing, or
 *          starting failed).
 */
async startBatchProcessing(): Promise<{ success: boolean; message?: string }> {
    const describe = (reason: unknown): string => reason instanceof Error ? reason.message : String(reason);

    if (this.batchProcessingState.inProgress) {
        return { success: false, message: 'Batch processing already in progress' };
    }

    try {
        const pending = this.getBlobsNeedingOCR();
        if (pending.length === 0) {
            return { success: false, message: 'No images found that need OCR processing' };
        }

        // A fresh state object is installed for every run.
        this.batchProcessingState = {
            inProgress: true,
            total: pending.length,
            processed: 0,
            startTime: new Date()
        };

        // Deliberately not awaited: the batch keeps running after this method returns.
        this.processBatchInBackground(pending).catch((failure) => {
            log.error(`Batch processing failed: ${describe(failure)}`);
            this.batchProcessingState.inProgress = false;
        });

        return { success: true };
    } catch (failure) {
        const reason = describe(failure);
        log.error(`Failed to start batch processing: ${reason}`);
        return { success: false, message: reason };
    }
}
|
|
|
|
/**
 * Returns a snapshot of the batch run's progress.
 *
 * @returns a copy of the batch state; `percentage` (0–100) is added only when
 *          `total` is greater than zero.
 */
getBatchProgress(): { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } {
    // Copy, so callers cannot mutate the live state.
    const snapshot: { inProgress: boolean; total: number; processed: number; percentage?: number; startTime?: Date } = {
        ...this.batchProcessingState
    };

    if (snapshot.total > 0) {
        snapshot.percentage = (snapshot.processed / snapshot.total) * 100;
    }

    return snapshot;
}
|
|
|
|
/**
 * Processes a batch of blobs one by one, updating the progress counters as it goes.
 *
 * The run is bound to the state object that is current when it starts. It stops when
 * that object's `inProgress` flag is cleared (cancellation) or when a different state
 * object has been installed (a newer run was started). This keeps a cancelled run that
 * is still awaiting an item from resuming when a new run starts, and from incrementing
 * the new run's counters.
 *
 * Failures of individual items are logged and counted as processed; they do not abort the batch.
 *
 * @param blobsToProcess - entities to process, in order.
 * @throws re-throws unexpected errors raised outside the per-item handling, after
 *         logging them and marking the run as finished.
 */
private async processBatchInBackground(blobsToProcess: Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }>): Promise<void> {
    // Runs synchronously up to the first await, so this is the state object the caller just installed.
    const runState = this.batchProcessingState;
    const isStillCurrentRun = (): boolean => runState.inProgress && this.batchProcessingState === runState;

    try {
        log.info('Starting batch OCR processing...');

        for (const blobInfo of blobsToProcess) {
            if (!isStillCurrentRun()) {
                break; // Stop if processing was cancelled or superseded by a newer run
            }

            try {
                if (blobInfo.entityType === 'note') {
                    await this.processNoteOCR(blobInfo.entityId);
                } else {
                    await this.processAttachmentOCR(blobInfo.entityId);
                }
                runState.processed++;

                // Add small delay to prevent overwhelming the system
                await new Promise(resolve => setTimeout(resolve, 500));
            } catch (error) {
                log.error(`Failed to process OCR for ${blobInfo.entityType} ${blobInfo.entityId}: ${error}`);
                runState.processed++; // Count as processed even if failed
            }
        }

        // Mark this run (and only this run) as completed
        runState.inProgress = false;
        log.info(`Batch OCR processing completed. Processed ${runState.processed} files.`);
    } catch (error) {
        log.error(`Batch OCR processing failed: ${error}`);
        runState.inProgress = false;
        throw error;
    }
}
|
|
|
|
/**
 * Requests cancellation of the running batch, if any.
 *
 * Clears the `inProgress` flag; the background loop checks it before each item,
 * so an item that is already being processed still finishes.
 */
cancelBatchProcessing(): void {
    if (!this.batchProcessingState.inProgress) {
        return;
    }

    this.batchProcessingState.inProgress = false;
    log.info('Batch OCR processing cancelled');
}
|
|
|
|
/**
 * Looks up the processor responsible for a MIME type.
 *
 * Processors are asked in registration order and the first one that accepts the type wins.
 *
 * @returns the matching processor, or `null` when none accepts the type.
 */
private getProcessorForMimeType(mimeType: string): FileProcessor | null {
    const registered = Array.from(this.processors.values());
    const match = registered.find(candidate => candidate.canProcess(mimeType));

    return match ?? null;
}
|
|
|
|
/**
 * Lists every MIME type that at least one registered processor declares as supported.
 *
 * @returns the distinct MIME types, in first-seen order.
 */
getAllSupportedMimeTypes(): string[] {
    const distinctTypes = new Set<string>();

    // The Set removes types declared by more than one processor.
    this.processors.forEach(processor => {
        processor.getSupportedMimeTypes().forEach(type => distinctTypes.add(type));
    });

    return [...distinctTypes];
}
|
|
|
|
/**
 * Tells whether any registered processor accepts the MIME type.
 *
 * @returns `false` for an empty MIME type or when no processor matches.
 */
isSupportedByAnyProcessor(mimeType: string): boolean {
    return Boolean(mimeType) && this.getProcessorForMimeType(mimeType) !== null;
}
|
|
|
|
/**
 * Decides whether a blob's OCR result is missing or older than the blob itself.
 *
 * @param blobId - blob to check.
 * @returns `true` when the blob was never processed or was modified after the last
 *          processing; `false` when it is up to date, when the blob ID is empty or
 *          unknown, or when the lookup fails (the failure is logged, not thrown).
 */
needsReprocessing(blobId: string): boolean {
    if (!blobId) {
        return false;
    }

    try {
        const timestamps = sql.getRow<{
            utcDateModified: string;
            textExtractionLastProcessed: string | null;
        }>(`
            SELECT utcDateModified, textExtractionLastProcessed
            FROM blobs
            WHERE blobId = ?
        `, [blobId]);

        if (!timestamps) {
            return false;
        }

        const { utcDateModified, textExtractionLastProcessed } = timestamps;

        // Never processed, so it needs processing.
        if (!textExtractionLastProcessed) {
            return true;
        }

        // Compare as instants; an unparsable date yields NaN and therefore `false`.
        const modifiedAt = new Date(utcDateModified).getTime();
        const processedAt = new Date(textExtractionLastProcessed).getTime();

        return modifiedAt > processedAt;
    } catch (error) {
        log.error(`Failed to check if blob ${blobId} needs reprocessing: ${error}`);
        return false;
    }
}
|
|
|
|
/**
 * Discards everything OCR-related stored on a blob: both the text and the
 * "last processed" timestamp are reset to NULL.
 *
 * @param blobId - blob to invalidate; an empty ID is ignored.
 * @throws re-throws any error raised by the SQL layer after logging it.
 */
invalidateOCRResult(blobId: string): void {
    if (!blobId) {
        return;
    }

    try {
        const resetSql = `
            UPDATE blobs SET
                textRepresentation = NULL,
                textExtractionLastProcessed = NULL
            WHERE blobId = ?
        `;
        sql.execute(resetSql, [blobId]);

        log.info(`Invalidated OCR result for blob ${blobId}`);
    } catch (error) {
        log.error(`Failed to invalidate OCR result for blob ${blobId}: ${error}`);
        throw error;
    }
}
|
|
|
|
/**
 * Lists the notes and attachments whose blob needs OCR processing: never processed,
 * or modified after the last processing.
 *
 * Candidates are non-deleted entities with a blob that are either of kind `image`
 * (any MIME type) or of kind `file` with one of the MIME types listed below. Image
 * entities are not filtered by MIME type here; entries with an unsupported type are
 * skipped later by processNoteOCR()/processAttachmentOCR().
 *
 * Staleness is decided by comparing the two timestamps as instants via `julianday()`.
 * This service writes `textExtractionLastProcessed` with `Date#toISOString()`;
 * `utcDateModified` is written elsewhere (NOTE(review): its exact text format is not
 * visible here), so a plain string comparison is only correct if both columns happen
 * to share the same format. If either value cannot be parsed, the original string
 * comparison is used as a fallback.
 *
 * @returns note entries first, then attachment entries; an empty array when a query
 *          fails (the failure is logged, not thrown).
 */
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
    // MIME types accepted for `file` notes/attachments, shared by both queries.
    const fileMimeTypes = [
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/msword',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/rtf',
        'application/pdf',
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/gif',
        'image/bmp',
        'image/tiff',
        'image/webp'
    ];
    const mimePlaceholders = fileMimeTypes.map(() => '?').join(', ');

    // Runs the candidate query against one entity table. Only the fixed identifiers
    // below are interpolated; the MIME types are bound as parameters.
    const selectCandidates = (
        table: 'notes' | 'attachments',
        kindColumn: 'type' | 'role',
        idColumn: 'noteId' | 'attachmentId'
    ) => sql.getRows<{
        blobId: string;
        mimeType: string;
        entityId: string;
    }>(`
        SELECT e.blobId, e.mime as mimeType, e.${idColumn} as entityId
        FROM ${table} e
        JOIN blobs b ON e.blobId = b.blobId
        WHERE (
            e.${kindColumn} = 'image'
            OR (
                e.${kindColumn} = 'file'
                AND e.mime IN (${mimePlaceholders})
            )
        )
        AND e.isDeleted = 0
        AND e.blobId IS NOT NULL
        AND (
            b.textExtractionLastProcessed IS NULL
            OR COALESCE(
                julianday(b.utcDateModified) > julianday(b.textExtractionLastProcessed),
                b.utcDateModified > b.textExtractionLastProcessed
            )
        )
    `, fileMimeTypes);

    try {
        const noteBlobs = selectCandidates('notes', 'type', 'noteId');
        const attachmentBlobs = selectCandidates('attachments', 'role', 'attachmentId');

        // Combine results, tagging each row with the kind of entity it came from
        return [
            ...noteBlobs.map(blob => ({ ...blob, entityType: 'note' as const })),
            ...attachmentBlobs.map(blob => ({ ...blob, entityType: 'attachment' as const }))
        ];
    } catch (error) {
        log.error(`Failed to get blobs needing OCR: ${error}`);
        return [];
    }
}
|
|
|
|
/**
 * Automatically runs OCR for every blob that needs it.
 *
 * Does nothing unless the `ocrAutoProcessImages` option is enabled. Items are
 * processed sequentially with a short pause after each success; a failing item is
 * logged and skipped so the remaining items are still processed.
 */
async processAllBlobsNeedingOCR(): Promise<void> {
    const autoProcessingEnabled = options.getOptionBool('ocrAutoProcessImages');
    if (!autoProcessingEnabled) {
        log.info('OCR auto-processing is disabled, skipping');
        return;
    }

    const pending = this.getBlobsNeedingOCR();
    if (pending.length === 0) {
        log.info('No blobs need OCR processing');
        return;
    }

    log.info(`Auto-processing OCR for ${pending.length} blobs...`);

    for (const { entityType, entityId } of pending) {
        try {
            if (entityType === 'note') {
                await this.processNoteOCR(entityId);
            } else {
                await this.processAttachmentOCR(entityId);
            }

            // Brief pause between items to avoid overwhelming the system
            await new Promise(resolve => setTimeout(resolve, 100));
        } catch (error) {
            // Log and move on to the next blob
            log.error(`Failed to auto-process OCR for ${entityType} ${entityId}: ${error}`);
        }
    }

    log.info('Auto-processing OCR completed');
}
|
|
}
|
|
|
|
/** Single shared OCR service instance, created when this module is first loaded. */
const ocrService = new OCRService();

export default ocrService;
|