fix(ocr): properly handle office MIME types

This commit is contained in:
Elian Doran
2026-04-02 12:41:45 +03:00
parent ad29375975
commit bdd2b7e317
2 changed files with 38 additions and 112 deletions

View File

@@ -620,7 +620,9 @@ class OCRService {
*/
getBlobsNeedingOCR(): Array<{ blobId: string; mimeType: string; entityType: 'note' | 'attachment'; entityId: string }> {
try {
// Get notes with blobs that need OCR (both image notes and file notes with supported MIME types)
const supportedMimes = this.getAllSupportedMimeTypes();
const placeholders = supportedMimes.map(() => '?').join(', ');
const noteBlobs = sql.getRows<{
blobId: string;
mimeType: string;
@@ -629,35 +631,12 @@ class OCRService {
SELECT n.blobId, n.mime as mimeType, n.noteId as entityId
FROM notes n
JOIN blobs b ON n.blobId = b.blobId
WHERE (
n.type = 'image'
OR (
n.type = 'file'
AND n.mime IN (
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/msword',
'application/vnd.ms-excel',
'application/vnd.ms-powerpoint',
'application/rtf',
'application/pdf',
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
)
)
)
WHERE (n.type = 'image' OR (n.type = 'file' AND n.mime IN (${placeholders})))
AND n.isDeleted = 0
AND n.blobId IS NOT NULL
AND b.textRepresentation IS NULL
`);
`, supportedMimes);
// Get attachments with blobs that need OCR (both image and file attachments with supported MIME types)
const attachmentBlobs = sql.getRows<{
blobId: string;
mimeType: string;
@@ -666,33 +645,11 @@ class OCRService {
SELECT a.blobId, a.mime as mimeType, a.attachmentId as entityId
FROM attachments a
JOIN blobs b ON a.blobId = b.blobId
WHERE (
a.role = 'image'
OR (
a.role = 'file'
AND a.mime IN (
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/msword',
'application/vnd.ms-excel',
'application/vnd.ms-powerpoint',
'application/rtf',
'application/pdf',
'image/jpeg',
'image/jpg',
'image/png',
'image/gif',
'image/bmp',
'image/tiff',
'image/webp'
)
)
)
WHERE (a.role = 'image' OR (a.role = 'file' AND a.mime IN (${placeholders})))
AND a.isDeleted = 0
AND a.blobId IS NOT NULL
AND b.textRepresentation IS NULL
`);
`, supportedMimes);
// Combine results
const result = [

View File

@@ -1,83 +1,52 @@
import * as officeParser from 'officeparser';
import log from '../../log.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
 * MIME types checked by `canProcess()` and reported by `getSupportedMimeTypes()`.
 *
 * Covers the Office Open XML and OpenDocument families only; the legacy
 * binary formats (DOC/XLS/PPT) and RTF are not listed here.
 *
 * Typed as `readonly` because this array is shared module-level state:
 * callers receive a copy, and nothing in this module should mutate it in place.
 */
const SUPPORTED_TYPES: readonly string[] = [
    // Office Open XML
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
    'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
    // OpenDocument
    'application/vnd.oasis.opendocument.text', // ODT
    'application/vnd.oasis.opendocument.spreadsheet', // ODS
    'application/vnd.oasis.opendocument.presentation' // ODP
];
/**
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
* Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files.
*/
export class OfficeProcessor extends FileProcessor {
private imageProcessor: ImageProcessor;
private readonly supportedTypes = [
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // PPTX
'application/msword', // DOC
'application/vnd.ms-excel', // XLS
'application/vnd.ms-powerpoint', // PPT
'application/rtf' // RTF
];
constructor() {
super();
this.imageProcessor = new ImageProcessor();
}
canProcess(mimeType: string): boolean {
return this.supportedTypes.includes(mimeType);
return SUPPORTED_TYPES.includes(mimeType);
}
getSupportedMimeTypes(): string[] {
return [...this.supportedTypes];
return [...SUPPORTED_TYPES];
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
try {
log.info('Starting Office document text extraction...');
log.info('Starting Office document text extraction...');
const language = options.language || "eng";
const text = await officeParser.parseOfficeAsync(buffer, {
outputErrorToConsole: false,
newlineDelimiter: '\n',
ignoreNotes: false,
putNotesAtLast: false
});
// Extract text from Office document
const data = await this.parseOfficeDocument(buffer);
const trimmed = (text || '').trim();
// Extract text from Office document
const combinedText = data.data && data.data.trim().length > 0 ? data.data.trim() : '';
const confidence = combinedText.length > 0 ? 0.99 : 0; // High confidence for direct text extraction
const result: OCRResult = {
text: combinedText,
confidence,
extractedAt: new Date().toISOString(),
language,
pageCount: 1 // Office documents are treated as single logical document
};
return result;
} catch (error) {
log.error(`Office document text extraction failed: ${error}`);
throw error;
}
}
private async parseOfficeDocument(buffer: Buffer): Promise<{ data: string }> {
try {
// Use promise-based API directly
const data = await officeParser.parseOfficeAsync(buffer, {
outputErrorToConsole: false,
newlineDelimiter: '\n',
ignoreNotes: false,
putNotesAtLast: false
});
return {
data: data || ''
};
} catch (error) {
throw new Error(`Office document parsing failed: ${error}`);
}
return {
text: trimmed,
confidence: trimmed.length > 0 ? 0.99 : 0,
extractedAt: new Date().toISOString(),
language: options.language || "eng",
pageCount: 1
};
}
getProcessingType(): string {
@@ -85,6 +54,6 @@ export class OfficeProcessor extends FileProcessor {
}
async cleanup(): Promise<void> {
await this.imageProcessor.cleanup();
// Nothing to clean up.
}
}