chore(ocr): remove unimplemented logic

2026-06-26 22:11:07 +02:00 · 2026-04-02 12:36:10 +03:00
parent cf73a4ef43
commit ad29375975
1 changed files with 13 additions and 75 deletions
--- a/apps/server/src/services/ocr/processors/pdf_processor.ts
+++ b/apps/server/src/services/ocr/processors/pdf_processor.ts
@@ -1,96 +1,34 @@
 import pdfParse from 'pdf-parse';

 import log from '../../log.js';
-import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
+import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
 import { FileProcessor } from './file_processor.js';
-import { ImageProcessor } from './image_processor.js';

 /**
- * PDF processor for extracting text from PDF files
- * First tries to extract existing text, then falls back to OCR on images
+ * PDF processor for extracting embedded text from PDF files using pdf-parse.
 */
 export class PDFProcessor extends FileProcessor {
-    private imageProcessor: ImageProcessor;
-    private readonly supportedTypes = ['application/pdf'];
-
-    constructor() {
-        super();
-        this.imageProcessor = new ImageProcessor();
-    }

    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    getSupportedMimeTypes(): string[] {
-        return [...this.supportedTypes];
+        return ['application/pdf'];
    }

    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
-        try {
-            log.info('Starting PDF text extraction...');
+        log.info('Starting PDF text extraction...');

-            // First try to extract existing text from PDF
-            if (options.enablePDFTextExtraction !== false) {
-                const textResult = await this.extractTextFromPDF(buffer, options);
-                if (textResult.text.trim().length > 0) {
-                    log.info(`PDF text extraction successful. Length: ${textResult.text.length}`);
-                    return textResult;
-                }
-            }
+        const data = await pdfParse(buffer);

-            // Fall back to OCR if no text found or PDF text extraction is disabled
-            log.info('No text found in PDF or text extraction disabled, falling back to OCR...');
-            return await this.extractTextViaOCR(buffer, options);
-
-        } catch (error) {
-            log.error(`PDF text extraction failed: ${error}`);
-            throw error;
-        }
-    }
-
-    private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
-        try {
-            const data = await pdfParse(buffer);
-
-            return {
-                text: data.text.trim(),
-                confidence: 0.99, // High confidence for direct text extraction
-                extractedAt: new Date().toISOString(),
-                language: options.language || "eng",
-                pageCount: data.numpages
-            };
-        } catch (error) {
-            log.error(`PDF text extraction failed: ${error}`);
-            throw error;
-        }
-    }
-
-    private async extractTextViaOCR(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
-        try {
-            // Convert PDF to images and OCR each page
-            // For now, we'll use a simple approach - convert first page to image
-            // In a full implementation, we'd convert all pages
-
-            // This is a simplified implementation
-            // In practice, you might want to use pdf2pic or similar library
-            // to convert PDF pages to images for OCR
-
-            // For now, we'll return a placeholder result
-            // indicating that OCR on PDF is not fully implemented
-            log.info('PDF to image conversion not fully implemented, returning placeholder');
-
-            return {
-                text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
-                confidence: 0.0,
-                extractedAt: new Date().toISOString(),
-                language: options.language || "eng",
-                pageCount: 1
-            };
-        } catch (error) {
-            log.error(`PDF OCR extraction failed: ${error}`);
-            throw error;
-        }
+        return {
+            text: data.text.trim(),
+            confidence: 0.99,
+            extractedAt: new Date().toISOString(),
+            language: options.language || "eng",
+            pageCount: data.numpages
+        };
    }

    getProcessingType(): string {
@@ -98,6 +36,6 @@ export class PDFProcessor extends FileProcessor {
    }

    async cleanup(): Promise<void> {
-        await this.imageProcessor.cleanup();
+        // Nothing to clean up.
    }
 }