From 23799562aefe0b8a2dfe3417360571b825397bd7 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Thu, 2 Apr 2026 22:53:57 +0300 Subject: [PATCH] refactor(ocr): reuse office processor for PDFs --- apps/server/src/services/ocr/ocr_service.ts | 2 - .../ocr/processors/office_processor.ts | 2 + .../services/ocr/processors/pdf_processor.ts | 40 ------------------- apps/server/src/types.d.ts | 5 --- 4 files changed, 2 insertions(+), 47 deletions(-) delete mode 100644 apps/server/src/services/ocr/processors/pdf_processor.ts diff --git a/apps/server/src/services/ocr/ocr_service.ts b/apps/server/src/services/ocr/ocr_service.ts index 12ab5d034a..0c6ec29ff3 100644 --- a/apps/server/src/services/ocr/ocr_service.ts +++ b/apps/server/src/services/ocr/ocr_service.ts @@ -9,7 +9,6 @@ import sql from '../sql.js'; import { FileProcessor } from './processors/file_processor.js'; import { ImageProcessor } from './processors/image_processor.js'; import { OfficeProcessor } from './processors/office_processor.js'; -import { PDFProcessor } from './processors/pdf_processor.js'; import { TIFFProcessor } from './processors/tiff_processor.js'; export interface OCRResult { @@ -37,7 +36,6 @@ class OCRService { constructor() { const imageProcessor = new ImageProcessor(); this.processors.set('image', imageProcessor); - this.processors.set('pdf', new PDFProcessor()); this.processors.set('tiff', new TIFFProcessor(imageProcessor)); this.processors.set('office', new OfficeProcessor()); } diff --git a/apps/server/src/services/ocr/processors/office_processor.ts b/apps/server/src/services/ocr/processors/office_processor.ts index cabc07b3e0..50b208e692 100644 --- a/apps/server/src/services/ocr/processors/office_processor.ts +++ b/apps/server/src/services/ocr/processors/office_processor.ts @@ -5,6 +5,8 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; import { FileProcessor } from './file_processor.js'; const SUPPORTED_TYPES = [ + // PDF + 'application/pdf', // Office Open XML 
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX diff --git a/apps/server/src/services/ocr/processors/pdf_processor.ts b/apps/server/src/services/ocr/processors/pdf_processor.ts deleted file mode 100644 index c59fe84b81..0000000000 --- a/apps/server/src/services/ocr/processors/pdf_processor.ts +++ /dev/null @@ -1,40 +0,0 @@ -// Import the actual library directly, bypassing the wrapper index.js -// which tries to read a test PDF file at import time. -import pdfParse from 'pdf-parse/lib/pdf-parse.js'; - -import log from '../../log.js'; -import { OCRProcessingOptions, OCRResult } from '../ocr_service.js'; -import { FileProcessor } from './file_processor.js'; - -/** - * PDF processor for extracting embedded text from PDF files using pdf-parse. - */ -export class PDFProcessor extends FileProcessor { - - canProcess(mimeType: string): boolean { - return mimeType.toLowerCase() === 'application/pdf'; - } - - getSupportedMimeTypes(): string[] { - return ['application/pdf']; - } - - async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> { - log.info('Starting PDF text extraction...'); - - const data = await pdfParse(buffer); - - return { - text: data.text.trim(), - confidence: 0.99, - extractedAt: new Date().toISOString(), - language: options.language || "eng", - pageCount: data.numpages - }; - } - - getProcessingType(): string { - return 'pdf'; - } - -} diff --git a/apps/server/src/types.d.ts b/apps/server/src/types.d.ts index 5c859c1342..8d6048a76c 100644 --- a/apps/server/src/types.d.ts +++ b/apps/server/src/types.d.ts @@ -39,11 +39,6 @@ declare module "@triliumnext/share-theme/styles.css" { export default content; } -declare module "pdf-parse/lib/pdf-parse.js" { - import pdfParse from "pdf-parse"; - export default pdfParse; -} - declare module '*.css' {} declare module '*?raw' { const src: string