refactor(ocr): reuse office processor for PDFs

This commit is contained in:
Elian Doran
2026-04-02 22:53:57 +03:00
parent f441a145b5
commit 23799562ae
4 changed files with 2 additions and 47 deletions

View File

@@ -9,7 +9,6 @@ import sql from '../sql.js';
import { FileProcessor } from './processors/file_processor.js';
import { ImageProcessor } from './processors/image_processor.js';
import { OfficeProcessor } from './processors/office_processor.js';
import { PDFProcessor } from './processors/pdf_processor.js';
import { TIFFProcessor } from './processors/tiff_processor.js';
export interface OCRResult {
@@ -37,7 +36,6 @@ class OCRService {
/**
 * Registers one processor instance per key in `this.processors`:
 * 'image', 'pdf', 'tiff' and 'office', in that order.
 */
constructor() {
// Held in a local so the very same instance can also be handed to the TIFF processor below.
const imageProcessor = new ImageProcessor();
this.processors.set('image', imageProcessor);
// NOTE(review): the commit title says PDFs are now handled by the office processor, and this
// hunk has lost its +/- markers — presumably this is the line being removed; confirm.
this.processors.set('pdf', new PDFProcessor());
// The TIFF processor is constructed with the shared ImageProcessor instance.
this.processors.set('tiff', new TIFFProcessor(imageProcessor));
this.processors.set('office', new OfficeProcessor());
}

View File

@@ -5,6 +5,8 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
const SUPPORTED_TYPES = [
// PDF
'application/pdf',
// Office Open XML
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX

View File

@@ -1,40 +0,0 @@
// Import the actual library directly, bypassing the wrapper index.js
// which tries to read a test PDF file at import time.
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
import log from '../../log.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
/**
 * File processor that pulls the embedded text layer out of PDF documents
 * by delegating to pdf-parse.
 */
export class PDFProcessor extends FileProcessor {
    /**
     * Reports whether the given MIME type is PDF.
     *
     * @param mimeType MIME type to test; compared case-insensitively.
     * @returns `true` only when the lower-cased value is exactly `application/pdf`.
     */
    canProcess(mimeType: string): boolean {
        const normalized = mimeType.toLowerCase();
        return normalized === 'application/pdf';
    }

    /**
     * @returns The MIME types this processor handles (a fresh array on every call).
     */
    getSupportedMimeTypes(): string[] {
        const supported: string[] = ['application/pdf'];
        return supported;
    }

    /**
     * Parses the PDF held in `buffer` and returns its text.
     *
     * @param buffer Raw PDF bytes, passed straight to pdf-parse.
     * @param options Processing options; only `language` is read here.
     * @returns The trimmed text, a fixed confidence of 0.99, the extraction
     *          timestamp (ISO 8601), the language (falling back to "eng" when
     *          `options.language` is falsy) and the page count reported by pdf-parse.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        log.info('Starting PDF text extraction...');

        const parsed = await pdfParse(buffer);

        const text = parsed.text.trim();
        const extractedAt = new Date().toISOString();
        const language = options.language ? options.language : "eng";

        return {
            text,
            confidence: 0.99,
            extractedAt,
            language,
            pageCount: parsed.numpages
        };
    }

    /**
     * @returns The identifier for this kind of processing: `'pdf'`.
     */
    getProcessingType(): string {
        return 'pdf';
    }
}

View File

@@ -39,11 +39,6 @@ declare module "@triliumnext/share-theme/styles.css" {
export default content;
}
// Ambient typing for the deep import path "pdf-parse/lib/pdf-parse.js":
// the module's default export is declared to be the default export of "pdf-parse",
// so importing from the deep path is typed the same as importing the package itself.
declare module "pdf-parse/lib/pdf-parse.js" {
import pdfParse from "pdf-parse";
export default pdfParse;
}
// Wildcard ambient module: matches any import specifier ending in ".css".
// The empty body declares no exports.
declare module '*.css' {}
declare module '*?raw' {
const src: string