mirror of
https://github.com/zadam/trilium.git
synced 2025-11-08 06:15:48 +01:00
added image OCR and parsing text from PDF (and OCR of PDF images)
This commit is contained in:
129
src/services/text_extracting.js
Normal file
129
src/services/text_extracting.js
Normal file
@@ -0,0 +1,129 @@
|
||||
const Canvas = require("canvas");
|
||||
const OCRAD = require("ocrad.js");
|
||||
const log = require("./log.js");
|
||||
const optionService = require("./options.js");
|
||||
|
||||
function ocrFromByteArray(img) {
|
||||
// byte array contains raw uncompressed pixel data
|
||||
// kind: 1 - GRAYSCALE_1BPP (unsupported)
|
||||
// kind: 2 - RGB_24BPP
|
||||
// kind: 3 - RGBA_32BPP
|
||||
|
||||
if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
const canvas = new Canvas.createCanvas(img.width, img.height);
|
||||
const ctx = canvas.getContext('2d');
|
||||
|
||||
const imageData = ctx.createImageData(img.width, img.height);
|
||||
const imageBytes = imageData.data;
|
||||
|
||||
for (let j = 0, k = 0, jj = img.width * img.height * 4; j < jj;) {
|
||||
imageBytes[j++] = img.data[k++];
|
||||
imageBytes[j++] = img.data[k++];
|
||||
imageBytes[j++] = img.data[k++];
|
||||
// in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
|
||||
imageBytes[j++] = img.kind === 2 ? 255 : img.data[k++];
|
||||
}
|
||||
|
||||
ctx.putImageData(imageData, 0, 0);
|
||||
const text = OCRAD(canvas);
|
||||
|
||||
log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
|
||||
const ops = await page.getOperatorList();
|
||||
|
||||
const fns = ops.fnArray;
|
||||
const args = ops.argsArray;
|
||||
|
||||
for (const arg of args) {
|
||||
const i = args.indexOf(arg);
|
||||
|
||||
if (fns[i] !== pdfjsLib.OPS.paintXObject && fns[i] !== pdfjsLib.OPS.paintImageXObject) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const imgKey = arg[0];
|
||||
const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));
|
||||
|
||||
if (!img) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const text = ocrFromByteArray(img);
|
||||
|
||||
if (text) {
|
||||
strings.push(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function extractTextFromPdf(note, buffer) {
|
||||
if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const pdfjsLib = require("pdfjs-dist");
|
||||
const doc = await pdfjsLib.getDocument({data: buffer}).promise;
|
||||
let strings = [];
|
||||
|
||||
for (let p = 1; p <= doc.numPages; p++) {
|
||||
const page = await doc.getPage(p);
|
||||
|
||||
const content = await page.getTextContent({
|
||||
normalizeWhitespace: true,
|
||||
disableCombineTextItems: false
|
||||
});
|
||||
|
||||
content.items.forEach(({str}) => strings.push(str));
|
||||
|
||||
try {
|
||||
if (optionService.getOptionBool('ocrImages')) {
|
||||
await ocrTextFromPdfImages(pdfjsLib, page, strings);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
|
||||
}
|
||||
}
|
||||
|
||||
strings = strings.filter(str => str?.trim());
|
||||
|
||||
note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
|
||||
}
|
||||
catch (e) {
|
||||
log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
|
||||
}
|
||||
}
|
||||
|
||||
async function ocrTextFromBuffer(buffer) {
|
||||
// buffer is expected to contain an image in JPEG, PNG etc.
|
||||
const start = Date.now();
|
||||
|
||||
const img = await new Promise((res, rej) => {
|
||||
const img = new Canvas.Image();
|
||||
img.onload = () => res(img);
|
||||
img.onerror = err => rej(new Error("Can't load the image " + err));
|
||||
img.src = buffer;
|
||||
});
|
||||
|
||||
const canvas = new Canvas.createCanvas(img.width, img.height);
|
||||
const ctx = canvas.getContext('2d');
|
||||
ctx.drawImage(img, 0, 0, img.width, img.height);
|
||||
const plainText = OCRAD(canvas);
|
||||
|
||||
log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
|
||||
return plainText;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
ocrTextFromBuffer,
|
||||
extractTextFromPdf
|
||||
};
|
||||
Reference in New Issue
Block a user