chore(deps): update dependency officeparser to v6.1.0 (#9482)

This commit is contained in:
Elian Doran
2026-04-19 12:36:47 +03:00
committed by GitHub
3 changed files with 48 additions and 31 deletions

View File

@@ -114,7 +114,7 @@
"mime-types": "3.0.2",
"multer": "2.1.1",
"normalize-strings": "1.1.1",
"officeparser": "6.0.7",
"officeparser": "6.1.0",
"rand-token": "1.0.1",
"safe-compare": "1.1.4",
"sanitize-filename": "1.6.4",

View File

@@ -1,25 +1,19 @@
import { parseExcel } from 'officeparser/dist/parsers/ExcelParser.js';
import { parseOpenOffice } from 'officeparser/dist/parsers/OpenOfficeParser.js';
import { parsePowerPoint } from 'officeparser/dist/parsers/PowerPointParser.js';
import { parseWord } from 'officeparser/dist/parsers/WordParser.js';
import type { OfficeParserConfig } from 'officeparser/dist/types.js';
import { OfficeParser, type OfficeParserConfig } from 'officeparser';
import log from '../../log.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
type Parser = (buffer: Buffer, config: OfficeParserConfig) => Promise<{ toText(): string }>;
const PARSER_BY_MIME: Record<string, Parser> = {
const SUPPORTED_MIME_TYPES = new Set([
// Office Open XML
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parseWord,
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': parseExcel,
'application/vnd.openxmlformats-officedocument.presentationml.presentation': parsePowerPoint,
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
// OpenDocument
'application/vnd.oasis.opendocument.text': parseOpenOffice,
'application/vnd.oasis.opendocument.spreadsheet': parseOpenOffice,
'application/vnd.oasis.opendocument.presentation': parseOpenOffice
};
'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet',
'application/vnd.oasis.opendocument.presentation'
]);
const PARSER_CONFIG: OfficeParserConfig = {
outputErrorToConsole: false,
@@ -30,28 +24,27 @@ const PARSER_CONFIG: OfficeParserConfig = {
/**
* Office document processor for extracting text from DOCX/XLSX/PPTX and ODT/ODS/ODP files.
* Uses individual parsers from officeparser v6 to avoid pulling in pdfjs-dist.
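* Uses officeparser's main API, which auto-detects the format from the buffer's magic bytes,
* so the caller-supplied MIME type only gates support and no longer selects the parser.
* NOTE(review): the per-parser imports this replaces existed to avoid pulling in pdfjs-dist;
* confirm that importing the main entry point does not load it eagerly.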
*/
export class OfficeProcessor extends FileProcessor {
canProcess(mimeType: string): boolean {
return mimeType in PARSER_BY_MIME;
return SUPPORTED_MIME_TYPES.has(mimeType);
}
getSupportedMimeTypes(): string[] {
return Object.keys(PARSER_BY_MIME);
return [...SUPPORTED_MIME_TYPES];
}
async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
const mimeType = options.mimeType;
if (!mimeType || !(mimeType in PARSER_BY_MIME)) {
if (!mimeType || !SUPPORTED_MIME_TYPES.has(mimeType)) {
throw new Error(`Unsupported MIME type for Office processor: ${mimeType}`);
}
log.info(`Starting Office document text extraction for ${mimeType}...`);
const parse = PARSER_BY_MIME[mimeType];
const ast = await parse(buffer, PARSER_CONFIG);
const ast = await OfficeParser.parseOffice(buffer, PARSER_CONFIG);
const trimmed = ast.toText().trim();
return {

42
pnpm-lock.yaml generated
View File

@@ -819,8 +819,8 @@ importers:
specifier: 1.1.1
version: 1.1.1
officeparser:
specifier: 6.0.7
version: 6.0.7(encoding@0.1.13)
specifier: 6.1.0
version: 6.1.0(encoding@0.1.13)
rand-token:
specifier: 1.0.1
version: 1.0.1
@@ -6680,6 +6680,10 @@ packages:
resolution: {integrity: sha512-9k/gHF6n/pAi/9tqr3m3aqkuiNosYTurLLUtc7xQ9sxB/wm7WPygCv8GYa6mS0fLJEHhqMC1ATYhz++U/lRHqg==}
engines: {node: '>=10.0.0'}
'@xmldom/xmldom@0.9.9':
resolution: {integrity: sha512-qycIHAucxy/LXAYIjmLmtQ8q9GPnMbnjG1KXhWm9o5sCr6pOYDATkMPiTNa6/v8eELyqOQ2FsEqeoFYmgv/gJg==}
engines: {node: '>=14.6'}
'@xtuc/ieee754@1.2.0':
resolution: {integrity: sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==}
@@ -8827,6 +8831,10 @@ packages:
resolution: {integrity: sha512-Ievi/yy8DS3ygGvT47PjSfdFoX+2isQueoYP1cntFW1JLYAuS4GD7NUPGg4zv2iZfV52uDyk5w5Z0TdpRS6Q1g==}
engines: {node: '>=20'}
file-type@22.0.1:
resolution: {integrity: sha512-ww5Mhre0EE+jmBvOXTmXAbEMuZE7uX4a3+oRCQFNj8w++g3ev913N6tXQz0XTXbueQ5TWQfm6BdaViEHHn8bhA==}
engines: {node: '>=22'}
file-uri-to-path@1.0.0:
resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==}
@@ -11199,8 +11207,8 @@ packages:
ofetch@1.5.1:
resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==}
officeparser@6.0.7:
resolution: {integrity: sha512-MkNHyWIfEZRDtB8c0fgJHdb4Ui0I/WztBjlUjlPiEbTO6dIYaJMt+llS5p5Foj13guUZgGxkkM9VwsVRthHNAA==}
officeparser@6.1.0:
resolution: {integrity: sha512-S/dMjUyhbeyDNUjnuGKsmuDx3IoOTcyy6uFzZ6321paaF5NVQVS+Ht8SkQEzEQ85DJ256LnTroMTP2PVKebX1Q==}
engines: {node: '>=18.0.0'}
hasBin: true
@@ -13427,6 +13435,10 @@ packages:
resolution: {integrity: sha512-ZPtzy0hu4cZjv3z5NW9gfKnNLjoz4y6uv4HlelAjDK7sY/xOkKZv9xK/WQpcsBB3jEybChz9DPC2U/+cusjJVQ==}
engines: {node: '>=18'}
uint8array-extras@1.5.0:
resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==}
engines: {node: '>=18'}
ulid@2.4.0:
resolution: {integrity: sha512-fIRiVTJNcSRmXKPZtGzFQv9WRrZ3M9eoptl/teFJvjOzmpU+/K/JH6HZ8deBfb5vMEpicJcLn7JmvdknlMq7Zg==}
hasBin: true
@@ -21603,6 +21615,8 @@ snapshots:
'@xmldom/xmldom@0.8.12': {}
'@xmldom/xmldom@0.9.9': {}
'@xtuc/ieee754@1.2.0': {}
'@xtuc/long@4.2.2': {}
@@ -24297,6 +24311,15 @@ snapshots:
transitivePeerDependencies:
- supports-color
file-type@22.0.1:
dependencies:
'@tokenizer/inflate': 0.4.1
strtok3: 10.3.5
token-types: 6.1.2
uint8array-extras: 1.5.0
transitivePeerDependencies:
- supports-color
file-uri-to-path@1.0.0: {}
file-uri-to-path@2.0.0: {}
@@ -27078,14 +27101,13 @@ snapshots:
node-fetch-native: 1.6.7
ufo: 1.6.1
officeparser@6.0.7(encoding@0.1.13):
officeparser@6.1.0(encoding@0.1.13):
dependencies:
'@xmldom/xmldom': 0.8.12
concat-stream: 2.0.0
file-type: 21.3.4
'@xmldom/xmldom': 0.9.9
fflate: 0.8.2
file-type: 22.0.1
pdfjs-dist: 5.6.205
tesseract.js: 7.0.0(encoding@0.1.13)
yauzl: 3.3.0
transitivePeerDependencies:
- encoding
- supports-color
@@ -29640,6 +29662,8 @@ snapshots:
uint8array-extras@1.4.0: {}
uint8array-extras@1.5.0: {}
ulid@2.4.0: {}
ulid@3.0.2: {}