From a9e7cd7bfead66432f21171944179fd0c1d67582 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 02:36:22 +0000 Subject: [PATCH 1/3] chore(deps): update dependency officeparser to v6.1.0 --- apps/server/package.json | 2 +- pnpm-lock.yaml | 52 +++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/apps/server/package.json b/apps/server/package.json index 3f5871dfda..6cefc7f673 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -114,7 +114,7 @@ "mime-types": "3.0.2", "multer": "2.1.1", "normalize-strings": "1.1.1", - "officeparser": "6.0.7", + "officeparser": "6.1.0", "rand-token": "1.0.1", "safe-compare": "1.1.4", "sanitize-filename": "1.6.4", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6c1c7168a7..f485d949ee 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -816,8 +816,8 @@ importers: specifier: 1.1.1 version: 1.1.1 officeparser: - specifier: 6.0.7 - version: 6.0.7(encoding@0.1.13) + specifier: 6.1.0 + version: 6.1.0(encoding@0.1.13) rand-token: specifier: 1.0.1 version: 1.0.1 @@ -6674,6 +6674,10 @@ packages: resolution: {integrity: sha512-9k/gHF6n/pAi/9tqr3m3aqkuiNosYTurLLUtc7xQ9sxB/wm7WPygCv8GYa6mS0fLJEHhqMC1ATYhz++U/lRHqg==} engines: {node: '>=10.0.0'} + '@xmldom/xmldom@0.9.9': + resolution: {integrity: sha512-qycIHAucxy/LXAYIjmLmtQ8q9GPnMbnjG1KXhWm9o5sCr6pOYDATkMPiTNa6/v8eELyqOQ2FsEqeoFYmgv/gJg==} + engines: {node: '>=14.6'} + '@xtuc/ieee754@1.2.0': resolution: {integrity: sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==} @@ -8821,6 +8825,10 @@ packages: resolution: {integrity: sha512-Ievi/yy8DS3ygGvT47PjSfdFoX+2isQueoYP1cntFW1JLYAuS4GD7NUPGg4zv2iZfV52uDyk5w5Z0TdpRS6Q1g==} engines: {node: '>=20'} + file-type@22.0.1: + resolution: {integrity: sha512-ww5Mhre0EE+jmBvOXTmXAbEMuZE7uX4a3+oRCQFNj8w++g3ev913N6tXQz0XTXbueQ5TWQfm6BdaViEHHn8bhA==} + engines: {node: '>=22'} + file-uri-to-path@1.0.0: resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} @@ -11193,8 +11201,8 @@ packages: ofetch@1.5.1: resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} - officeparser@6.0.7: - resolution: {integrity: sha512-MkNHyWIfEZRDtB8c0fgJHdb4Ui0I/WztBjlUjlPiEbTO6dIYaJMt+llS5p5Foj13guUZgGxkkM9VwsVRthHNAA==} + officeparser@6.1.0: + resolution: {integrity: sha512-S/dMjUyhbeyDNUjnuGKsmuDx3IoOTcyy6uFzZ6321paaF5NVQVS+Ht8SkQEzEQ85DJ256LnTroMTP2PVKebX1Q==} engines: {node: '>=18.0.0'} hasBin: true @@ -11656,10 +11664,6 @@ packages: resolution: {integrity: sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==} engines: {node: ^10 || ^12 || >=14} - postcss@8.5.8: - resolution: {integrity: sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg==} - engines: {node: ^10 || ^12 || >=14} - postcss@8.5.9: resolution: {integrity: sha512-7a70Nsot+EMX9fFU3064K/kdHWZqGVY+BADLyXc8Dfv+mTLLVl6JzJpPaCZ2kQL9gIJvKXSLMHhqdRRjwQeFtw==} engines: {node: ^10 || ^12 || >=14} @@ -13425,6 +13429,10 @@ packages: resolution: {integrity: sha512-ZPtzy0hu4cZjv3z5NW9gfKnNLjoz4y6uv4HlelAjDK7sY/xOkKZv9xK/WQpcsBB3jEybChz9DPC2U/+cusjJVQ==} engines: {node: '>=18'} + uint8array-extras@1.5.0: + resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==} + engines: {node: '>=18'} + ulid@2.4.0: resolution: {integrity: sha512-fIRiVTJNcSRmXKPZtGzFQv9WRrZ3M9eoptl/teFJvjOzmpU+/K/JH6HZ8deBfb5vMEpicJcLn7JmvdknlMq7Zg==} hasBin: true @@ -21589,6 +21597,8 @@ snapshots: '@xmldom/xmldom@0.8.12': {} + '@xmldom/xmldom@0.9.9': {} + '@xtuc/ieee754@1.2.0': {} '@xtuc/long@4.2.2': {} @@ -24283,6 +24293,15 @@ snapshots: transitivePeerDependencies: - supports-color + file-type@22.0.1: + dependencies: + '@tokenizer/inflate': 0.4.1 + strtok3: 10.3.5 + token-types: 6.1.2 + uint8array-extras: 1.5.0 + transitivePeerDependencies: + - supports-color + file-uri-to-path@1.0.0: {} file-uri-to-path@2.0.0: {} @@ -27069,14 +27088,13 @@ snapshots: node-fetch-native: 1.6.7 ufo: 1.6.1 - officeparser@6.0.7(encoding@0.1.13): + officeparser@6.1.0(encoding@0.1.13): dependencies: - '@xmldom/xmldom': 0.8.12 - concat-stream: 2.0.0 - file-type: 21.3.4 + '@xmldom/xmldom': 0.9.9 + fflate: 0.8.2 + file-type: 22.0.1 pdfjs-dist: 5.6.205 tesseract.js: 7.0.0(encoding@0.1.13) - yauzl: 3.3.0 transitivePeerDependencies: - encoding - supports-color @@ -27552,12 +27570,6 @@ snapshots: picocolors: 1.1.1 source-map-js: 1.2.1 - postcss@8.5.8: - dependencies: - nanoid: 3.3.11 - picocolors: 1.1.1 - source-map-js: 1.2.1 - postcss@8.5.9: dependencies: nanoid: 3.3.11 @@ -29637,6 +29649,8 @@ snapshots: uint8array-extras@1.4.0: {} + uint8array-extras@1.5.0: {} + ulid@2.4.0: {} ulid@3.0.2: {} From b2bcccb4c7e47b1a8c07da66e59f13098aabbb4d Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 17:07:02 +0000 Subject: [PATCH 2/3] fix(ocr): adapt OfficeProcessor to officeparser v6.1.0 ESM changes v6.1.0 added native ESM with Node16 resolution and a strict exports field, breaking deep subpath imports like officeparser/dist/parsers/ExcelParser.js. Switch to the main package entry and use parseOfficeAsync(), which accepts a Buffer and auto-detects the format via magic bytes. Co-authored-by: Elian Doran --- .../ocr/processors/office_processor.ts | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index f2ee7e8ebe..cb6b5f0a27 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,25 +1,20 @@ -import { parseExcel } from 'officeparser/dist/parsers/ExcelParser.js'; -import { parseOpenOffice } from 'officeparser/dist/parsers/OpenOfficeParser.js'; -import { parsePowerPoint } from 'officeparser/dist/parsers/PowerPointParser.js'; -import { parseWord } from 'officeparser/dist/parsers/WordParser.js'; -import type { OfficeParserConfig } from 'officeparser/dist/types.js'; +import officeparser from 'officeparser'; +import type { OfficeParserConfig } from 'officeparser'; import log from '../../log.js'; import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; -type Parser = (buffer: Buffer, config: OfficeParserConfig) => Promise<{ toText(): string }>; - -const PARSER_BY_MIME: Record = { +const SUPPORTED_MIME_TYPES = new Set([ // Office Open XML - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parseWord, - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': parseExcel, - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': parsePowerPoint, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // OpenDocument - 'application/vnd.oasis.opendocument.text': parseOpenOffice, - 'application/vnd.oasis.opendocument.spreadsheet': parseOpenOffice, - 'application/vnd.oasis.opendocument.presentation': parseOpenOffice -}; + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation' +]); const PARSER_CONFIG: OfficeParserConfig = { outputErrorToConsole: false, @@ -30,29 +25,28 @@ const PARSER_CONFIG: OfficeParserConfig = { /** * Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files. - * Uses individual parsers from officeparser v6 to avoid pulling in pdfjs-dist. + * Uses officeparser's main API, which auto-detects the format from the buffer's magic bytes. */ export class OfficeProcessor extends FileProcessor { canProcess(mimeType: string): boolean { - return mimeType in PARSER_BY_MIME; + return SUPPORTED_MIME_TYPES.has(mimeType); } getSupportedMimeTypes(): string[] { - return Object.keys(PARSER_BY_MIME); + return [...SUPPORTED_MIME_TYPES]; } async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise { const mimeType = options.mimeType; - if (!mimeType || !(mimeType in PARSER_BY_MIME)) { + if (!mimeType || !SUPPORTED_MIME_TYPES.has(mimeType)) { throw new Error(`Unsupported MIME type for Office processor: ${mimeType}`); } log.info(`Starting Office document text extraction for ${mimeType}...`); - const parse = PARSER_BY_MIME[mimeType]; - const ast = await parse(buffer, PARSER_CONFIG); - const trimmed = ast.toText().trim(); + const text = await officeparser.parseOfficeAsync(buffer, PARSER_CONFIG); + const trimmed = text.trim(); return { text: trimmed, From f9baac34ccd514edf31c1e8d06bc1a0f0f26452e Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Sun, 19 Apr 2026 12:10:08 +0300 Subject: [PATCH 3/3] fix(ocr): use correct officeparser v6.1.0 API v6.1.0 renamed parseOfficeAsync to OfficeParser.parseOffice (static method) and returns an AST object with toText() instead of a plain string. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../server/src/services/ocr/processors/office_processor.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index cb6b5f0a27..42e4c7bb24 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -1,5 +1,4 @@ -import officeparser from 'officeparser'; -import type { OfficeParserConfig } from 'officeparser'; +import { OfficeParser, type OfficeParserConfig } from 'officeparser'; import log from '../../log.js'; import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; @@ -45,8 +44,8 @@ export class OfficeProcessor extends FileProcessor { log.info(`Starting Office document text extraction for ${mimeType}...`); - const text = await officeparser.parseOfficeAsync(buffer, PARSER_CONFIG); - const trimmed = text.trim(); + const ast = await OfficeParser.parseOffice(buffer, PARSER_CONFIG); + const trimmed = ast.toText().trim(); return { text: trimmed,