mirror of
https://github.com/zadam/trilium.git
synced 2026-05-07 00:25:30 +02:00
refactor(ocr): reuse office processor for PDFs
This commit is contained in:
@@ -9,7 +9,6 @@ import sql from '../sql.js';
|
||||
import { FileProcessor } from './processors/file_processor.js';
|
||||
import { ImageProcessor } from './processors/image_processor.js';
|
||||
import { OfficeProcessor } from './processors/office_processor.js';
|
||||
import { PDFProcessor } from './processors/pdf_processor.js';
|
||||
import { TIFFProcessor } from './processors/tiff_processor.js';
|
||||
|
||||
export interface OCRResult {
|
||||
@@ -37,7 +36,6 @@ class OCRService {
|
||||
/**
 * Registers each file processor under its type key.
 * One ImageProcessor instance is registered as 'image' and is also handed to the TIFF processor.
 */
constructor() {
    const sharedImageProcessor = new ImageProcessor();
    const registry = this.processors;

    registry.set('image', sharedImageProcessor);
    registry.set('pdf', new PDFProcessor());
    registry.set('tiff', new TIFFProcessor(sharedImageProcessor));
    registry.set('office', new OfficeProcessor());
}
|
||||
|
||||
@@ -5,6 +5,8 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
const SUPPORTED_TYPES = [
|
||||
// PDF
|
||||
'application/pdf',
|
||||
// Office Open XML
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// Import the actual library directly, bypassing the wrapper index.js
|
||||
// which tries to read a test PDF file at import time.
|
||||
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
|
||||
|
||||
import log from '../../log.js';
|
||||
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
/**
 * File processor that pulls the embedded text out of PDF documents with pdf-parse.
 */
export class PDFProcessor extends FileProcessor {

    /** The single MIME type this processor accepts. */
    private static readonly PDF_MIME_TYPE = 'application/pdf';

    /**
     * Reports whether the given MIME type is the PDF type; letter case is ignored.
     *
     * @param mimeType MIME type to check.
     * @returns true only when the lower-cased value equals 'application/pdf'.
     */
    canProcess(mimeType: string): boolean {
        const normalized = mimeType.toLowerCase();
        return normalized === PDFProcessor.PDF_MIME_TYPE;
    }

    /**
     * @returns a new array containing the PDF MIME type.
     */
    getSupportedMimeTypes(): string[] {
        return [PDFProcessor.PDF_MIME_TYPE];
    }

    /**
     * Parses the buffer with pdf-parse and wraps the outcome in an OCRResult.
     *
     * @param buffer raw PDF content.
     * @param options processing options; only `language` is read here.
     * @returns the trimmed text, a fixed confidence, the extraction timestamp,
     *          the language and the page count reported by pdf-parse.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        log.info('Starting PDF text extraction...');

        const parsed = await pdfParse(buffer);

        const result: OCRResult = {
            text: parsed.text.trim(),
            // The same constant confidence is reported for every extraction.
            confidence: 0.99,
            extractedAt: new Date().toISOString(),
            // Any falsy language (including an empty string) falls back to "eng".
            language: options.language || "eng",
            pageCount: parsed.numpages
        };
        return result;
    }

    /**
     * @returns the processing type identifier of this processor.
     */
    getProcessingType(): string {
        return 'pdf';
    }

}
|
||||
5
apps/server/src/types.d.ts
vendored
5
apps/server/src/types.d.ts
vendored
@@ -39,11 +39,6 @@ declare module "@triliumnext/share-theme/styles.css" {
|
||||
export default content;
|
||||
}
|
||||
|
||||
// Typing for the deep import path of pdf-parse: its default export is the
// default export of the "pdf-parse" package.
declare module "pdf-parse/lib/pdf-parse.js" {
    import pdfParse from "pdf-parse";
    export { pdfParse as default };
}
|
||||
|
||||
// Wildcard ambient module for imports of `.css` files; the empty body declares no exports.
declare module '*.css' {}
|
||||
declare module '*?raw' {
|
||||
const src: string
|
||||
|
||||
Reference in New Issue
Block a user