mirror of
https://github.com/zadam/trilium.git
synced 2026-07-05 23:49:22 +02:00
fix(ocr): properly handle office MIME types
This commit is contained in:
@@ -620,7 +620,9 @@ class OCRService {
|
||||
*/
|
||||
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
|
||||
try {
|
||||
// Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
|
||||
const supportedMimes = this.getAllSupportedMimeTypes();
|
||||
const placeholders = supportedMimes.map(() => '?').join(', ');
|
||||
|
||||
const noteBlobs = sql.getRows<{
|
||||
blobId: string;
|
||||
mimeType: string;
|
||||
@@ -629,35 +631,12 @@ class OCRService {
|
||||
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
|
||||
FROM notes n
|
||||
JOIN blobs b ON n.blobId = b.blobId
|
||||
WHERE (
|
||||
n.type = 'image'
|
||||
OR (
|
||||
n.type = 'file'
|
||||
AND n.mime IN (
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/msword',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/rtf',
|
||||
'application/pdf',
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
)
|
||||
)
|
||||
)
|
||||
WHERE (n.type = 'image' OR (n.type = 'file' AND n.mime IN (${placeholders})))
|
||||
AND n.isDeleted = 0
|
||||
AND n.blobId IS NOT NULL
|
||||
AND b.textRepresentation IS NULL
|
||||
`);
|
||||
`, supportedMimes);
|
||||
|
||||
// Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
|
||||
const attachmentBlobs = sql.getRows<{
|
||||
blobId: string;
|
||||
mimeType: string;
|
||||
@@ -666,33 +645,11 @@ class OCRService {
|
||||
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
|
||||
FROM attachments a
|
||||
JOIN blobs b ON a.blobId = b.blobId
|
||||
WHERE (
|
||||
a.role = 'image'
|
||||
OR (
|
||||
a.role = 'file'
|
||||
AND a.mime IN (
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/msword',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/rtf',
|
||||
'application/pdf',
|
||||
'image/jpeg',
|
||||
'image/jpg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp'
|
||||
)
|
||||
)
|
||||
)
|
||||
WHERE (a.role = 'image' OR (a.role = 'file' AND a.mime IN (${placeholders})))
|
||||
AND a.isDeleted = 0
|
||||
AND a.blobId IS NOT NULL
|
||||
AND b.textRepresentation IS NULL
|
||||
`);
|
||||
`, supportedMimes);
|
||||
|
||||
// Combine results
|
||||
const result = [
|
||||
|
||||
@@ -1,83 +1,52 @@
|
||||
import * as officeParser from 'officeparser';
|
||||
|
||||
import log from '../../log.js';
|
||||
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
|
||||
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
import { ImageProcessor } from './image_processor.js';
|
||||
|
||||
const SUPPORTED_TYPES = [
|
||||
// Office Open XML
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
||||
// OpenDocument
|
||||
'application/vnd.oasis.opendocument.text', // ODT
|
||||
'application/vnd.oasis.opendocument.spreadsheet', // ODS
|
||||
'application/vnd.oasis.opendocument.presentation' // ODP
|
||||
];
|
||||
|
||||
/**
|
||||
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
|
||||
* Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files.
|
||||
*/
|
||||
export class OfficeProcessor extends FileProcessor {
|
||||
private imageProcessor: ImageProcessor;
|
||||
private readonly supportedTypes = [
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
|
||||
'application/msword', // DOC
|
||||
'application/vnd.ms-excel', // XLS
|
||||
'application/vnd.ms-powerpoint', // PPT
|
||||
'application/rtf' // RTF
|
||||
];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.imageProcessor = new ImageProcessor();
|
||||
}
|
||||
|
||||
canProcess(mimeType: string): boolean {
|
||||
return this.supportedTypes.includes(mimeType);
|
||||
return SUPPORTED_TYPES.includes(mimeType);
|
||||
}
|
||||
|
||||
getSupportedMimeTypes(): string[] {
|
||||
return [...this.supportedTypes];
|
||||
return [...SUPPORTED_TYPES];
|
||||
}
|
||||
|
||||
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
|
||||
try {
|
||||
log.info('Starting Office document text extraction...');
|
||||
log.info('Starting Office document text extraction...');
|
||||
|
||||
const language = options.language || "eng";
|
||||
const text = await officeParser.parseOfficeAsync(buffer, {
|
||||
outputErrorToConsole: false,
|
||||
newlineDelimiter: '\n',
|
||||
ignoreNotes: false,
|
||||
putNotesAtLast: false
|
||||
});
|
||||
|
||||
// Extract text from Office document
|
||||
const data = await this.parseOfficeDocument(buffer);
|
||||
const trimmed = (text || '').trim();
|
||||
|
||||
// Extract text from Office document
|
||||
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
|
||||
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
|
||||
|
||||
const result: OCRResult = {
|
||||
text: combinedText,
|
||||
confidence,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language,
|
||||
pageCount: 1 // Office documents are treated as single logical document
|
||||
};
|
||||
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
log.error(`Office document text extraction failed: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
|
||||
try {
|
||||
// Use promise-based API directly
|
||||
const data = await officeParser.parseOfficeAsync(buffer, {
|
||||
outputErrorToConsole: false,
|
||||
newlineDelimiter: '\n',
|
||||
ignoreNotes: false,
|
||||
putNotesAtLast: false
|
||||
});
|
||||
|
||||
return {
|
||||
data: data || ''
|
||||
};
|
||||
} catch (error) {
|
||||
throw new Error(`Office document parsing failed: ${error}`);
|
||||
}
|
||||
return {
|
||||
text: trimmed,
|
||||
confidence: trimmed.length > 0 ? 0.99 : 0,
|
||||
extractedAt: new Date().toISOString(),
|
||||
language: options.language || "eng",
|
||||
pageCount: 1
|
||||
};
|
||||
}
|
||||
|
||||
getProcessingType(): string {
|
||||
@@ -85,6 +54,6 @@ export class OfficeProcessor extends FileProcessor {
|
||||
}
|
||||
|
||||
async cleanup(): Promise<void> {
|
||||
await this.imageProcessor.cleanup();
|
||||
// Nothing to clean up.
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user