From a3a52aaafe221bad1920a22021150d08901fb8eb Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Fri, 3 Apr 2026 09:22:56 +0300 Subject: [PATCH] chore(ocr): switch to unpdf due to issues with pdfjs-dist --- apps/server/package.json | 5 ++- apps/server/src/services/ocr/ocr_service.ts | 2 + .../ocr/processors/office_processor.ts | 2 - .../services/ocr/processors/pdf_processor.ts | 39 +++++++++++++++++++ pnpm-lock.yaml | 19 ++++++++- 5 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 apps/server/src/services/ocr/processors/pdf_processor.ts diff --git a/apps/server/package.json b/apps/server/package.json index 6ab053e984..20e87efdba 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -37,7 +37,8 @@ "better-sqlite3": "12.8.0", "html-to-text": "9.0.5", "node-html-parser": "7.1.0", - "sucrase": "3.35.1" + "sucrase": "3.35.1", + "unpdf": "1.4.0" }, "devDependencies": { "@braintree/sanitize-url": "7.1.2", @@ -126,8 +127,8 @@ "strip-bom": "5.0.0", "striptags": "3.2.0", "supertest": "7.2.2", - "tesseract.js": "6.0.1", "swagger-jsdoc": "6.2.8", + "tesseract.js": "6.0.1", "time2fa": "1.4.2", "tmp": "0.2.5", "turnish": "1.8.0", diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index f5594740ea..583e38e08c 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -9,6 +9,7 @@ import sql from '../sql.js'; import { FileProcessor } from './processors/file_processor.js'; import { ImageProcessor } from './processors/image_processor.js'; import { OfficeProcessor } from './processors/office_processor.js'; +import { PDFProcessor } from './processors/pdf_processor.js'; export interface OCRResult { text: string; @@ -34,6 +35,7 @@ class OCRService { constructor() { this.processors.set('image', new ImageProcessor()); + this.processors.set('pdf', new PDFProcessor()); this.processors.set('office', new OfficeProcessor()); } diff --git 
a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index 50b208e692..cabc07b3e0 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -5,8 +5,6 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; const SUPPORTED_TYPES = [ - // PDF - 'application/pdf', // Office Open XML 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts b/apps/server/src/services/ocr/processors/pdf_processor.ts new file mode 100644 index 0000000000..9605665857 --- /dev/null +++ b/apps/server/src/services/ocr/processors/pdf_processor.ts @@ -0,0 +1,39 @@ +import { extractText, getDocumentProxy } from 'unpdf'; + +import log from '../../log.js'; +import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; +import { FileProcessor } from './file_processor.js'; + +/** + * PDF processor for extracting embedded text from PDF files using unpdf. 
+ */ +export class PDFProcessor extends FileProcessor { + + canProcess(mimeType: string): boolean { + return mimeType.toLowerCase() === 'application/pdf'; + } + + getSupportedMimeTypes(): string[] { + return ['application/pdf']; + } + + async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> { + log.info('Starting PDF text extraction...'); + + const pdf = await getDocumentProxy(new Uint8Array(buffer)); + const { totalPages, text } = await extractText(pdf, { mergePages: true }); + + return { + text: text.trim(), + confidence: 0.99, + extractedAt: new Date().toISOString(), + language: options.language || "eng", + pageCount: totalPages + }; + } + + getProcessingType(): string { + return 'pdf'; + } + +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c0326eb43e..87f74ca7b0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -580,6 +580,9 @@ importers: sucrase: specifier: 3.35.1 version: 3.35.1 + unpdf: + specifier: 1.4.0 + version: 1.4.0(@napi-rs/canvas@0.1.96) devDependencies: '@braintree/sanitize-url': specifier: 7.1.2 @@ -15341,6 +15344,14 @@ packages: resolution: {integrity: sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA==} engines: {node: '>= 0.4.0'} + unpdf@1.4.0: + resolution: {integrity: sha512-TahIk0xdH/4jh/MxfclzU79g40OyxtP00VnEUZdEkJoYtXAHWLiir6t3FC6z3vDqQTzc2ZHcla6uEiVTNjejuA==} + peerDependencies: + '@napi-rs/canvas': ^0.1.69 + peerDependenciesMeta: + '@napi-rs/canvas': + optional: true + unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -17596,8 +17607,6 @@ snapshots: '@ckeditor/ckeditor5-widget': 47.6.1 ckeditor5: 47.6.1 es-toolkit: 1.39.5 - transitivePeerDependencies: - - supports-color '@ckeditor/ckeditor5-icons@47.6.1': {} @@ -17650,6 +17659,8 @@ snapshots: '@ckeditor/ckeditor5-ui': 47.6.1 '@ckeditor/ckeditor5-utils': 47.6.1 ckeditor5: 47.6.1 +
transitivePeerDependencies: + - supports-color '@ckeditor/ckeditor5-line-height@47.6.1': dependencies: @@ -34340,6 +34351,10 @@ snapshots: unorm@1.6.0: optional: true + unpdf@1.4.0(@napi-rs/canvas@0.1.96): + optionalDependencies: + '@napi-rs/canvas': 0.1.96 + unpipe@1.0.0: {} unplugin-utils@0.3.1: