diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index f2ee7e8ebe..cb6b5f0a27 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,25 +1,20 @@ -import { parseExcel } from 'officeparser/dist/parsers/ExcelParser.js'; -import { parseOpenOffice } from 'officeparser/dist/parsers/OpenOfficeParser.js'; -import { parsePowerPoint } from 'officeparser/dist/parsers/PowerPointParser.js'; -import { parseWord } from 'officeparser/dist/parsers/WordParser.js'; -import type { OfficeParserConfig } from 'officeparser/dist/types.js'; +import officeparser from 'officeparser'; +import type { OfficeParserConfig } from 'officeparser'; import log from '../../log.js'; import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; -type Parser = (buffer: Buffer, config: OfficeParserConfig) => Promise<{ toText(): string }>; - -const PARSER_BY_MIME: Record = { +const SUPPORTED_MIME_TYPES = new Set([ // Office Open XML - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parseWord, - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': parseExcel, - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': parsePowerPoint, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // OpenDocument - 'application/vnd.oasis.opendocument.text': parseOpenOffice, - 'application/vnd.oasis.opendocument.spreadsheet': parseOpenOffice, - 'application/vnd.oasis.opendocument.presentation': parseOpenOffice -}; + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation' +]); const PARSER_CONFIG: OfficeParserConfig = { outputErrorToConsole: false, @@ -30,29 +25,28 @@ const PARSER_CONFIG: OfficeParserConfig = { /** * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files. - * Uses individual parsers from officeparser v6 to avoid pulling in pdfjs-dist. + * Uses officeparser's main API, which auto-detects the format from the buffer's magic bytes. */ export class OfficeProcessor extends FileProcessor { canProcess(mimeType: string): boolean { - return mimeType in PARSER_BY_MIME; + return SUPPORTED_MIME_TYPES.has(mimeType); } getSupportedMimeTypes(): string[] { - return Object.keys(PARSER_BY_MIME); + return [...SUPPORTED_MIME_TYPES]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { const mimeType = options.mimeType; - if (!mimeType || !(mimeType in PARSER_BY_MIME)) { + if (!mimeType || !SUPPORTED_MIME_TYPES.has(mimeType)) { throw new Error(`Unsupported MIME type for Office processor: ${mimeType}`); } log.info(`Starting Office document text extraction for ${mimeType}...`); - const parse = PARSER_BY_MIME[mimeType]; - const ast = await parse(buffer, PARSER_CONFIG); - const trimmed = ast.toText().trim(); + const text = await officeparser.parseOfficeAsync(buffer, PARSER_CONFIG); + const trimmed = text.trim(); return { text: trimmed,