fix(ocr): too many blob queries in search

2026-07-04 15:18:07 +02:00 · 2026-04-02 20:58:11 +03:00
parent 30ba36894d
commit d79d2e9ad2
1 changed files with 34 additions and 23 deletions
--- a/apps/server/src/services/search/expressions/ocr_content.ts
+++ b/apps/server/src/services/search/expressions/ocr_content.ts
@@ -1,4 +1,5 @@
 import becca from "../../../becca/becca.js";
+import sql from "../../sql.js";
 import NoteSet from "../note_set.js";
 import type SearchContext from "../search_context.js";
 import Expression from "./expression.js";
@@ -6,6 +7,9 @@ import Expression from "./expression.js";
 /**
 * Search expression for finding text within OCR-extracted content (textRepresentation)
 * from image notes and their attachments.
+ *
+ * Uses a single SQL query to find all noteIds whose own blob or attachment blobs
+ * contain matching text, then intersects with the input note set.
 */
 export default class OCRContentExpression extends Expression {
    private tokens: string[];
@@ -17,9 +21,11 @@ export default class OCRContentExpression extends Expression {

    execute(inputNoteSet: NoteSet, executionContext: object, searchContext: SearchContext): NoteSet {
        const resultNoteSet = new NoteSet();
+        const matchingNoteIds = this.findNoteIdsWithMatchingOCR();

-        for (const note of inputNoteSet.notes) {
-            if (this.noteMatchesOCR(note.noteId)) {
+        for (const noteId of matchingNoteIds) {
+            const note = becca.notes[noteId];
+            if (note && inputNoteSet.hasNoteId(noteId)) {
                resultNoteSet.add(note);
            }
        }
@@ -35,32 +41,37 @@ export default class OCRContentExpression extends Expression {
    }

    /**
-     * Check if a note (or its attachments) has OCR text matching all tokens.
+     * Find all noteIds that have OCR text matching all tokens, in a single query.
+     * Checks both the note's own blob and its attachment blobs.
     */
-    private noteMatchesOCR(noteId: string): boolean {
-        const note = becca.notes[noteId];
-        if (!note) return false;
+    private findNoteIdsWithMatchingOCR(): Set<string> {
+        if (this.tokens.length === 0) return new Set();

-        // Collect all textRepresentation values for this note
-        const texts: string[] = [];
+        // Build WHERE conditions: all tokens must appear in textRepresentation
+        const likeConditions = this.tokens.map(() => `b.textRepresentation LIKE ?`).join(' AND ');
+        const params = this.tokens.map(token => `%${token}%`);

-        const noteBlob = becca.getBlob({ blobId: note.blobId });
-        if (noteBlob?.textRepresentation) {
-            texts.push(noteBlob.textRepresentation.toLowerCase());
-        }
+        // Find notes whose own blob matches
+        const noteIds = sql.getColumn<string>(`
+            SELECT n.noteId
+            FROM notes n
+            JOIN blobs b ON n.blobId = b.blobId
+            WHERE b.textRepresentation IS NOT NULL
+              AND n.isDeleted = 0
+              AND ${likeConditions}
+        `, params);

-        for (const attachment of note.getAttachments()) {
-            const blob = becca.getBlob({ blobId: attachment.blobId });
-            if (blob?.textRepresentation) {
-                texts.push(blob.textRepresentation.toLowerCase());
-            }
-        }
+        // Find notes that own attachments whose blob matches
+        const attachmentOwnerIds = sql.getColumn<string>(`
+            SELECT a.ownerId
+            FROM attachments a
+            JOIN blobs b ON a.blobId = b.blobId
+            WHERE b.textRepresentation IS NOT NULL
+              AND a.isDeleted = 0
+              AND ${likeConditions}
+        `, params);

-        if (texts.length === 0) return false;
-
-        // All tokens must appear in at least one of the text representations
-        const combined = texts.join(" ");
-        return this.tokens.every(token => combined.includes(token.toLowerCase()));
+        return new Set([...noteIds, ...attachmentOwnerIds]);
    }

    toString(): string {