fix(ocr): use correct officeparser v6.1.0 API

v6.1.0 renamed parseOfficeAsync to the static method
OfficeParser.parseOffice, which now returns an AST object with toText()
instead of a plain string.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Elian Doran
2026-04-19 12:10:08 +03:00
parent b2bcccb4c7
commit f9baac34cc

View File

@@ -1,5 +1,4 @@
import officeparser from 'officeparser';
import type { OfficeParserConfig } from 'officeparser';
import { OfficeParser, type OfficeParserConfig } from 'officeparser';
import log from '../../log.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
@@ -45,8 +44,8 @@ export class OfficeProcessor extends FileProcessor {
log.info(`Starting Office document text extraction for ${mimeType}...`);
const text = await officeparser.parseOfficeAsync(buffer, PARSER_CONFIG);
const trimmed = text.trim();
const ast = await OfficeParser.parseOffice(buffer, PARSER_CONFIG);
const trimmed = ast.toText().trim();
return {
text: trimmed,