diff --git a/apps/server/package.json b/apps/server/package.json index 14852983f0..0e5e75acf8 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -114,7 +114,7 @@ "mime-types": "3.0.2", "multer": "2.1.1", "normalize-strings": "1.1.1", - "officeparser": "6.0.7", + "officeparser": "6.1.0", "rand-token": "1.0.1", "safe-compare": "1.1.4", "sanitize-filename": "1.6.4", diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index f2ee7e8ebe..42e4c7bb24 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,25 +1,19 @@ -import { parseExcel } from 'officeparser/dist/parsers/ExcelParser.js'; -import { parseOpenOffice } from 'officeparser/dist/parsers/OpenOfficeParser.js'; -import { parsePowerPoint } from 'officeparser/dist/parsers/PowerPointParser.js'; -import { parseWord } from 'officeparser/dist/parsers/WordParser.js'; -import type { OfficeParserConfig } from 'officeparser/dist/types.js'; +import { OfficeParser, type OfficeParserConfig } from 'officeparser'; import log from '../../log.js'; import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; -type Parser = (buffer: Buffer, config: OfficeParserConfig) => Promise<{ toText(): string }>; - -const PARSER_BY_MIME: Record = { +const SUPPORTED_MIME_TYPES = new Set([ // Office Open XML - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parseWord, - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': parseExcel, - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': parsePowerPoint, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // OpenDocument - 'application/vnd.oasis.opendocument.text': parseOpenOffice, - 'application/vnd.oasis.opendocument.spreadsheet': parseOpenOffice, - 'application/vnd.oasis.opendocument.presentation': parseOpenOffice -}; + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation' +]); const PARSER_CONFIG: OfficeParserConfig = { outputErrorToConsole: false, @@ -30,28 +24,27 @@ const PARSER_CONFIG: OfficeParserConfig = { /** * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files. - * Uses individual parsers from officeparser v6 to avoid pulling in pdfjs-dist. + * Uses officeparser's main API, which auto-detects the format from the buffer's magic bytes. */ export class OfficeProcessor extends FileProcessor { canProcess(mimeType: string): boolean { - return mimeType in PARSER_BY_MIME; + return SUPPORTED_MIME_TYPES.has(mimeType); } getSupportedMimeTypes(): string[] { - return Object.keys(PARSER_BY_MIME); + return [...SUPPORTED_MIME_TYPES]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { const mimeType = options.mimeType; - if (!mimeType || !(mimeType in PARSER_BY_MIME)) { + if (!mimeType || !SUPPORTED_MIME_TYPES.has(mimeType)) { throw new Error(`Unsupported MIME type for Office processor: ${mimeType}`); } log.info(`Starting Office document text extraction for ${mimeType}...`); - const parse = PARSER_BY_MIME[mimeType]; - const ast = await parse(buffer, PARSER_CONFIG); + const ast = await OfficeParser.parseOffice(buffer, PARSER_CONFIG); const trimmed = ast.toText().trim(); return { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a186016fa0..6eb381bda8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -819,8 +819,8 @@ importers: specifier: 1.1.1 version: 1.1.1 officeparser: - specifier: 6.0.7 - version: 6.0.7(encoding@0.1.13) + specifier: 6.1.0 + version: 6.1.0(encoding@0.1.13) rand-token: specifier: 1.0.1 version: 1.0.1 @@ -6680,6 +6680,10 @@ packages: resolution: {integrity: sha512-9k/gHF6n/pAi/9tqr3m3aqkuiNosYTurLLUtc7xQ9sxB/wm7WPygCv8GYa6mS0fLJEHhqMC1ATYhz++U/lRHqg==} engines: {node: '>=10.0.0'} + '@xmldom/xmldom@0.9.9': + resolution: {integrity: sha512-qycIHAucxy/LXAYIjmLmtQ8q9GPnMbnjG1KXhWm9o5sCr6pOYDATkMPiTNa6/v8eELyqOQ2FsEqeoFYmgv/gJg==} + engines: {node: '>=14.6'} + '@xtuc/ieee754@1.2.0': resolution: {integrity: sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==} @@ -8827,6 +8831,10 @@ packages: resolution: {integrity: sha512-Ievi/yy8DS3ygGvT47PjSfdFoX+2isQueoYP1cntFW1JLYAuS4GD7NUPGg4zv2iZfV52uDyk5w5Z0TdpRS6Q1g==} engines: {node: '>=20'} + file-type@22.0.1: + resolution: {integrity: sha512-ww5Mhre0EE+jmBvOXTmXAbEMuZE7uX4a3+oRCQFNj8w++g3ev913N6tXQz0XTXbueQ5TWQfm6BdaViEHHn8bhA==} + engines: {node: '>=22'} + file-uri-to-path@1.0.0: resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} @@ -11199,8 +11207,8 @@ packages: ofetch@1.5.1: resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} - officeparser@6.0.7: - resolution: {integrity: sha512-MkNHyWIfEZRDtB8c0fgJHdb4Ui0I/WztBjlUjlPiEbTO6dIYaJMt+llS5p5Foj13guUZgGxkkM9VwsVRthHNAA==} + officeparser@6.1.0: + resolution: {integrity: sha512-S/dMjUyhbeyDNUjnuGKsmuDx3IoOTcyy6uFzZ6321paaF5NVQVS+Ht8SkQEzEQ85DJ256LnTroMTP2PVKebX1Q==} engines: {node: '>=18.0.0'} hasBin: true @@ -13427,6 +13435,10 @@ packages: resolution: {integrity: sha512-ZPtzy0hu4cZjv3z5NW9gfKnNLjoz4y6uv4HlelAjDK7sY/xOkKZv9xK/WQpcsBB3jEybChz9DPC2U/+cusjJVQ==} engines: {node: '>=18'} + uint8array-extras@1.5.0: + resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==} + engines: {node: '>=18'} + ulid@2.4.0: resolution: {integrity: sha512-fIRiVTJNcSRmXKPZtGzFQv9WRrZ3M9eoptl/teFJvjOzmpU+/K/JH6HZ8deBfb5vMEpicJcLn7JmvdknlMq7Zg==} hasBin: true @@ -21603,6 +21615,8 @@ snapshots: '@xmldom/xmldom@0.8.12': {} + '@xmldom/xmldom@0.9.9': {} + '@xtuc/ieee754@1.2.0': {} '@xtuc/long@4.2.2': {} @@ -24297,6 +24311,15 @@ snapshots: transitivePeerDependencies: - supports-color + file-type@22.0.1: + dependencies: + '@tokenizer/inflate': 0.4.1 + strtok3: 10.3.5 + token-types: 6.1.2 + uint8array-extras: 1.5.0 + transitivePeerDependencies: + - supports-color + file-uri-to-path@1.0.0: {} file-uri-to-path@2.0.0: {} @@ -27078,14 +27101,13 @@ snapshots: node-fetch-native: 1.6.7 ufo: 1.6.1 - officeparser@6.0.7(encoding@0.1.13): + officeparser@6.1.0(encoding@0.1.13): dependencies: - '@xmldom/xmldom': 0.8.12 - concat-stream: 2.0.0 - file-type: 21.3.4 + '@xmldom/xmldom': 0.9.9 + fflate: 0.8.2 + file-type: 22.0.1 pdfjs-dist: 5.6.205 tesseract.js: 7.0.0(encoding@0.1.13) - yauzl: 3.3.0 transitivePeerDependencies: - encoding - supports-color @@ -29640,6 +29662,8 @@ snapshots: uint8array-extras@1.4.0: {} + uint8array-extras@1.5.0: {} + ulid@2.4.0: {} ulid@3.0.2: {}