mirror of
https://github.com/zadam/trilium.git
synced 2026-05-09 14:37:49 +02:00
chore(ocr): switch to unpdf due to issues with pdfjs-dist
This commit is contained in:
@@ -37,7 +37,8 @@
|
||||
"better-sqlite3": "12.8.0",
|
||||
"html-to-text": "9.0.5",
|
||||
"node-html-parser": "7.1.0",
|
||||
"sucrase": "3.35.1"
|
||||
"sucrase": "3.35.1",
|
||||
"unpdf": "1.4.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@braintree/sanitize-url": "7.1.2",
|
||||
@@ -126,8 +127,8 @@
|
||||
"strip-bom": "5.0.0",
|
||||
"striptags": "3.2.0",
|
||||
"supertest": "7.2.2",
|
||||
"tesseract.js": "6.0.1",
|
||||
"swagger-jsdoc": "6.2.8",
|
||||
"tesseract.js": "6.0.1",
|
||||
"time2fa": "1.4.2",
|
||||
"tmp": "0.2.5",
|
||||
"turnish": "1.8.0",
|
||||
|
||||
@@ -9,6 +9,7 @@ import sql from '../sql.js';
|
||||
import { FileProcessor } from './processors/file_processor.js';
|
||||
import { ImageProcessor } from './processors/image_processor.js';
|
||||
import { OfficeProcessor } from './processors/office_processor.js';
|
||||
import { PDFProcessor } from './processors/pdf_processor.js';
|
||||
|
||||
export interface OCRResult {
|
||||
text: string;
|
||||
@@ -34,6 +35,7 @@ class OCRService {
|
||||
|
||||
constructor() {
    // Each entry pairs a processing-type key with a factory, so every
    // processor is still instantiated and registered one at a time, in the
    // same order as before: image, then pdf, then office.
    const registrations = [
        ['image', () => new ImageProcessor()],
        ['pdf', () => new PDFProcessor()],
        ['office', () => new OfficeProcessor()]
    ] as const;

    for (const [processingType, createProcessor] of registrations) {
        this.processors.set(processingType, createProcessor());
    }
}
|
||||
|
||||
|
||||
@@ -5,8 +5,6 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
const SUPPORTED_TYPES = [
|
||||
// PDF
|
||||
'application/pdf',
|
||||
// Office Open XML
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX
|
||||
|
||||
39
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
39
apps/server/src/services/ocr/processors/pdf_processor.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import { extractText, getDocumentProxy } from 'unpdf';
|
||||
|
||||
import log from '../../log.js';
|
||||
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
|
||||
import { FileProcessor } from './file_processor.js';
|
||||
|
||||
/**
 * PDF processor for extracting embedded text from PDF files using unpdf.
 *
 * The processor only reads text through unpdf's `extractText`; it performs no
 * image recognition of its own.
 */
export class PDFProcessor extends FileProcessor {

    /** The single MIME type handled by this processor. */
    private static readonly MIME_TYPE = 'application/pdf';

    /**
     * Tells whether the given MIME type is a PDF.
     *
     * @param mimeType MIME type to check; compared case-insensitively.
     * @returns `true` only for `application/pdf`.
     */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === PDFProcessor.MIME_TYPE;
    }

    /**
     * @returns the list of MIME types this processor accepts.
     */
    getSupportedMimeTypes(): string[] {
        return [PDFProcessor.MIME_TYPE];
    }

    /**
     * Extracts the text of all pages of a PDF, merged into a single string.
     *
     * @param buffer raw bytes of the PDF file.
     * @param options processing options; only `language` is read here, and it
     *                is echoed into the result (defaulting to `"eng"`).
     * @returns the trimmed text, the page count and extraction metadata.
     * @throws whatever unpdf throws when the document cannot be opened or read.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        log.info('Starting PDF text extraction...');

        // The bytes are copied into a fresh Uint8Array before being handed to
        // unpdf, so the caller's Buffer is not the object passed to the library.
        const pdf = await getDocumentProxy(new Uint8Array(buffer));

        try {
            const { totalPages, text } = await extractText(pdf, { mergePages: true });

            return {
                text: text.trim(),
                // Fixed value: it is not measured from the document.
                confidence: 0.99,
                extractedAt: new Date().toISOString(),
                language: options.language || "eng",
                pageCount: totalPages
            };
        } finally {
            // Always release the document, even if extraction failed; otherwise
            // every processed PDF keeps its pdf.js resources alive.
            try {
                await pdf.destroy();
            } catch (e: unknown) {
                // A cleanup failure must not hide the extraction result or error.
                const reason = e instanceof Error ? e.message : String(e);
                log.info(`Failed to release PDF document: ${reason}`);
            }
        }
    }

    /**
     * @returns the processing-type identifier for this processor, `'pdf'`.
     */
    getProcessingType(): string {
        return 'pdf';
    }

}
|
||||
19
pnpm-lock.yaml
generated
19
pnpm-lock.yaml
generated
@@ -580,6 +580,9 @@ importers:
|
||||
sucrase:
|
||||
specifier: 3.35.1
|
||||
version: 3.35.1
|
||||
unpdf:
|
||||
specifier: 1.4.0
|
||||
version: 1.4.0(@napi-rs/canvas@0.1.96)
|
||||
devDependencies:
|
||||
'@braintree/sanitize-url':
|
||||
specifier: 7.1.2
|
||||
@@ -15341,6 +15344,14 @@ packages:
|
||||
resolution: {integrity: sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA==}
|
||||
engines: {node: '>= 0.4.0'}
|
||||
|
||||
unpdf@1.4.0:
|
||||
resolution: {integrity: sha512-TahIk0xdH/4jh/MxfclzU79g40OyxtP00VnEUZdEkJoYtXAHWLiir6t3FC6z3vDqQTzc2ZHcla6uEiVTNjejuA==}
|
||||
peerDependencies:
|
||||
'@napi-rs/canvas': ^0.1.69
|
||||
peerDependenciesMeta:
|
||||
'@napi-rs/canvas':
|
||||
optional: true
|
||||
|
||||
unpipe@1.0.0:
|
||||
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==}
|
||||
engines: {node: '>= 0.8'}
|
||||
@@ -17596,8 +17607,6 @@ snapshots:
|
||||
'@ckeditor/ckeditor5-widget': 47.6.1
|
||||
ckeditor5: 47.6.1
|
||||
es-toolkit: 1.39.5
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
'@ckeditor/ckeditor5-icons@47.6.1': {}
|
||||
|
||||
@@ -17650,6 +17659,8 @@ snapshots:
|
||||
'@ckeditor/ckeditor5-ui': 47.6.1
|
||||
'@ckeditor/ckeditor5-utils': 47.6.1
|
||||
ckeditor5: 47.6.1
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
'@ckeditor/ckeditor5-line-height@47.6.1':
|
||||
dependencies:
|
||||
@@ -34340,6 +34351,10 @@ snapshots:
|
||||
unorm@1.6.0:
|
||||
optional: true
|
||||
|
||||
unpdf@1.4.0(@napi-rs/canvas@0.1.96):
|
||||
optionalDependencies:
|
||||
'@napi-rs/canvas': 0.1.96
|
||||
|
||||
unpipe@1.0.0: {}
|
||||
|
||||
unplugin-utils@0.3.1:
|
||||
|
||||
Reference in New Issue
Block a user