chore(ocr): switch to unpdf due to issues with pdfjs-dist

This commit is contained in:
Elian Doran
2026-04-03 09:22:56 +03:00
parent a6c4401973
commit a3a52aaafe
5 changed files with 61 additions and 6 deletions

View File

@@ -37,7 +37,8 @@
"better-sqlite3": "12.8.0",
"html-to-text": "9.0.5",
"node-html-parser": "7.1.0",
"sucrase": "3.35.1"
"sucrase": "3.35.1",
"unpdf": "1.4.0"
},
"devDependencies": {
"@braintree/sanitize-url": "7.1.2",
@@ -126,8 +127,8 @@
"strip-bom": "5.0.0",
"striptags": "3.2.0",
"supertest": "7.2.2",
"tesseract.js": "6.0.1",
"swagger-jsdoc": "6.2.8",
"tesseract.js": "6.0.1",
"time2fa": "1.4.2",
"tmp": "0.2.5",
"turnish": "1.8.0",

View File

@@ -9,6 +9,7 @@ import sql from '../sql.js';
import { FileProcessor } from './processors/file_processor.js';
import { ImageProcessor } from './processors/image_processor.js';
import { OfficeProcessor } from './processors/office_processor.js';
import { PDFProcessor } from './processors/pdf_processor.js';
export interface OCRResult {
text: string;
@@ -34,6 +35,7 @@ class OCRService {
/**
 * Registers one processor instance per processing type.
 * Entries are inserted in the order image, pdf, office.
 */
constructor() {
    const registrations = [
        ['image', new ImageProcessor()],
        ['pdf', new PDFProcessor()],
        ['office', new OfficeProcessor()]
    ] as const;

    for (const [processingType, processor] of registrations) {
        this.processors.set(processingType, processor);
    }
}

View File

@@ -5,8 +5,6 @@ import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
const SUPPORTED_TYPES = [
// PDF
'application/pdf',
// Office Open XML
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // DOCX
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // XLSX

View File

@@ -0,0 +1,39 @@
import { extractText, getDocumentProxy } from 'unpdf';
import log from '../../log.js';
import { OCRProcessingOptions, OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
/**
 * PDF processor for extracting embedded text from PDF files using unpdf.
 *
 * The text is read through unpdf's `extractText`; the reported confidence is a
 * fixed constant and is not derived from the document contents.
 */
export class PDFProcessor extends FileProcessor {
    /**
     * Tells whether this processor handles the given MIME type.
     *
     * @param mimeType MIME type to check; compared case-insensitively.
     * @returns `true` only for `application/pdf`.
     */
    canProcess(mimeType: string): boolean {
        return mimeType.toLowerCase() === 'application/pdf';
    }

    /**
     * @returns the list of MIME types handled by this processor.
     */
    getSupportedMimeTypes(): string[] {
        return ['application/pdf'];
    }

    /**
     * Extracts the text of a PDF, with all pages merged into a single string.
     *
     * @param buffer raw bytes of the PDF file.
     * @param options processing options; only `language` is read, and it is
     *                echoed back in the result (defaults to `"eng"`).
     * @returns the trimmed text, a fixed confidence of 0.99, the extraction
     *          timestamp (ISO 8601), the language and the page count.
     * @throws whatever `getDocumentProxy` / `extractText` reject with; errors
     *         are not caught here.
     */
    async extractText(buffer: Buffer, options: OCRProcessingOptions = {}): Promise<OCRResult> {
        log.info('Starting PDF text extraction...');

        // The bytes are copied into a plain Uint8Array before being handed over.
        // NOTE(review): presumably because pdf.js does not accept Node Buffers — confirm.
        const pdf = await getDocumentProxy(new Uint8Array(buffer));

        try {
            const { totalPages, text } = await extractText(pdf, { mergePages: true });

            return {
                text: text.trim(),
                confidence: 0.99,
                extractedAt: new Date().toISOString(),
                language: options.language || "eng",
                pageCount: totalPages
            };
        } finally {
            // Release the resources held by the document proxy whether or not
            // extraction succeeded. A cleanup failure is only logged so that it
            // cannot turn a successful extraction into an error.
            try {
                await pdf.destroy();
            } catch (error: unknown) {
                const reason = error instanceof Error ? error.message : String(error);
                log.info(`Failed to release PDF document resources: ${reason}`);
            }
        }
    }

    /**
     * @returns the processing type identifier for this processor, `'pdf'`.
     */
    getProcessingType(): string {
        return 'pdf';
    }
}

19
pnpm-lock.yaml generated
View File

@@ -580,6 +580,9 @@ importers:
sucrase:
specifier: 3.35.1
version: 3.35.1
unpdf:
specifier: 1.4.0
version: 1.4.0(@napi-rs/canvas@0.1.96)
devDependencies:
'@braintree/sanitize-url':
specifier: 7.1.2
@@ -15341,6 +15344,14 @@ packages:
resolution: {integrity: sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA==}
engines: {node: '>= 0.4.0'}
unpdf@1.4.0:
resolution: {integrity: sha512-TahIk0xdH/4jh/MxfclzU79g40OyxtP00VnEUZdEkJoYtXAHWLiir6t3FC6z3vDqQTzc2ZHcla6uEiVTNjejuA==}
peerDependencies:
'@napi-rs/canvas': ^0.1.69
peerDependenciesMeta:
'@napi-rs/canvas':
optional: true
unpipe@1.0.0:
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==}
engines: {node: '>= 0.8'}
@@ -17596,8 +17607,6 @@ snapshots:
'@ckeditor/ckeditor5-widget': 47.6.1
ckeditor5: 47.6.1
es-toolkit: 1.39.5
transitivePeerDependencies:
- supports-color
'@ckeditor/ckeditor5-icons@47.6.1': {}
@@ -17650,6 +17659,8 @@ snapshots:
'@ckeditor/ckeditor5-ui': 47.6.1
'@ckeditor/ckeditor5-utils': 47.6.1
ckeditor5: 47.6.1
transitivePeerDependencies:
- supports-color
'@ckeditor/ckeditor5-line-height@47.6.1':
dependencies:
@@ -34340,6 +34351,10 @@ snapshots:
unorm@1.6.0:
optional: true
unpdf@1.4.0(@napi-rs/canvas@0.1.96):
optionalDependencies:
'@napi-rs/canvas': 0.1.96
unpipe@1.0.0: {}
unplugin-utils@0.3.1: