refactor(ocr): get rid of require imports

This commit is contained in:
Elian Doran
2026-04-01 16:30:27 +03:00
parent 38f6fb5a7f
commit b626fb448b
4 changed files with 43 additions and 42 deletions

View File

@@ -1,8 +1,9 @@
import Tesseract from 'tesseract.js';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
/**
* Image processor for extracting text from image files using Tesseract
@@ -135,7 +136,6 @@ export class ImageProcessor extends FileProcessor {
*/
private getDefaultOCRLanguage(): string {
try {
const options = require('../../options.js').default;
const ocrLanguage = options.getOption('ocrLanguage');
if (!ocrLanguage) {
throw new Error('OCR language not configured in user settings');
@@ -161,8 +161,8 @@ export class ImageProcessor extends FileProcessor {
};
}
let filteredWords: string[] = [];
let validConfidences: number[] = [];
const filteredWords: string[] = [];
const validConfidences: number[] = [];
// Tesseract provides word-level data
if (data.words && Array.isArray(data.words)) {
@@ -182,13 +182,12 @@ export class ImageProcessor extends FileProcessor {
filteredText: data.text.trim(),
overallConfidence
};
} else {
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
return {
filteredText: '',
overallConfidence
};
}
log.info(`Entire text filtered out due to low confidence ${overallConfidence} (below threshold ${minConfidence})`);
return {
filteredText: '',
overallConfidence
};
}
// Calculate average confidence of accepted words

View File

@@ -1,8 +1,10 @@
import * as officeParser from 'officeparser';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
* Office document processor for extracting text and images from DOCX/XLSX/PPTX files
@@ -51,9 +53,9 @@ export class OfficeProcessor extends FileProcessor {
const result: OCRResult = {
text: combinedText,
confidence: confidence,
confidence,
extractedAt: new Date().toISOString(),
language: language,
language,
pageCount: 1 // Office documents are treated as single logical document
};
@@ -97,7 +99,6 @@ export class OfficeProcessor extends FileProcessor {
*/
private getDefaultOCRLanguage(): string {
try {
const options = require('../../options.js').default;
const ocrLanguage = options.getOption('ocrLanguage');
if (!ocrLanguage) {
throw new Error('OCR language not configured in user settings');

View File

@@ -1,9 +1,10 @@
import * as pdfParse from 'pdf-parse';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
import sharp from 'sharp';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
* PDF processor for extracting text from PDF files
@@ -58,7 +59,7 @@ export class PDFProcessor extends FileProcessor {
private async extractTextFromPDF(buffer: Buffer, options: OCRProcessingOptions): Promise<OCRResult> {
try {
const data = await pdfParse(buffer);
return {
text: data.text.trim(),
confidence: 0.99, // High confidence for direct text extraction
@@ -77,15 +78,15 @@ export class PDFProcessor extends FileProcessor {
// Intended approach: convert the PDF pages to images and OCR each page.
// A simple first version could convert only the first page to an image;
// a full implementation would convert all pages.
// None of this conversion is implemented yet.
// In practice, you might want to use pdf2pic or a similar library
// to convert PDF pages to images for OCR.
// For now, we return a placeholder result
// indicating that OCR on PDF is not fully implemented.
log.info('PDF to image conversion not fully implemented, returning placeholder');
return {
text: '[PDF OCR not fully implemented - would convert PDF pages to images and OCR each page]',
confidence: 0.0,
@@ -112,7 +113,6 @@ export class PDFProcessor extends FileProcessor {
*/
private getDefaultOCRLanguage(): string {
try {
const options = require('../../options.js').default;
const ocrLanguage = options.getOption('ocrLanguage');
if (!ocrLanguage) {
throw new Error('OCR language not configured in user settings');
@@ -132,16 +132,16 @@ export class PDFProcessor extends FileProcessor {
if (!language || typeof language !== 'string') {
return false;
}
// Split by '+' for multi-language format
const languages = language.split('+');
// Check each language code: 2-3 letters, optionally followed by an underscore and 2-3 more letters (e.g. "eng", "chi_sim"); digits are not accepted
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
return languages.every(lang => {
const trimmed = lang.trim();
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
});
}
}
}

View File

@@ -1,8 +1,10 @@
import sharp from 'sharp';
import { FileProcessor } from './file_processor.js';
import { OCRResult, OCRProcessingOptions } from '../ocr_service.js';
import { ImageProcessor } from './image_processor.js';
import log from '../../log.js';
import options from '../../options.js';
import { OCRProcessingOptions,OCRResult } from '../ocr_service.js';
import { FileProcessor } from './file_processor.js';
import { ImageProcessor } from './image_processor.js';
/**
* TIFF processor for extracting text from multi-page TIFF files
@@ -45,7 +47,7 @@ export class TIFFProcessor extends FileProcessor {
for (let page = 0; page < pageCount; page++) {
try {
log.info(`Processing TIFF page ${page + 1}/${pageCount}...`);
// Extract page as PNG buffer
const pageBuffer = await sharp(buffer, { page })
.png()
@@ -53,10 +55,10 @@ export class TIFFProcessor extends FileProcessor {
// OCR the page
const pageResult = await this.imageProcessor.extractText(pageBuffer, options);
if (pageResult.text.trim().length > 0) {
if (combinedText.length > 0) {
combinedText += '\n\n--- Page ' + (page + 1) + ' ---\n';
combinedText += `\n\n--- Page ${page + 1} ---\n`;
}
combinedText += pageResult.text;
totalConfidence += pageResult.confidence;
@@ -74,7 +76,7 @@ export class TIFFProcessor extends FileProcessor {
confidence: averageConfidence,
extractedAt: new Date().toISOString(),
language: options.language || this.getDefaultOCRLanguage(),
pageCount: pageCount
pageCount
};
log.info(`TIFF text extraction completed. Pages: ${pageCount}, Confidence: ${averageConfidence}%, Text length: ${result.text.length}`);
@@ -99,7 +101,6 @@ export class TIFFProcessor extends FileProcessor {
*/
private getDefaultOCRLanguage(): string {
try {
const options = require('../../options.js').default;
const ocrLanguage = options.getOption('ocrLanguage');
if (!ocrLanguage) {
throw new Error('OCR language not configured in user settings');
@@ -119,16 +120,16 @@ export class TIFFProcessor extends FileProcessor {
if (!language || typeof language !== 'string') {
return false;
}
// Split by '+' for multi-language format
const languages = language.split('+');
// Check each language code: 2-3 letters, optionally followed by an underscore and 2-3 more letters (e.g. "eng", "chi_sim"); digits are not accepted
const validLanguagePattern = /^[a-zA-Z]{2,3}(_[a-zA-Z]{2,3})?$/;
return languages.every(lang => {
const trimmed = lang.trim();
return trimmed.length > 0 && validLanguagePattern.test(trimmed);
});
}
}
}