From 77733ce2050953eeb927939becfcbdf97882a2eb Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Wed, 11 Mar 2026 21:11:55 -0700 Subject: [PATCH] feat(search): try to rice performance some more --- apps/server/spec/search_profiling.spec.ts | 380 +++++++++--------- apps/server/src/becca/becca-interface.ts | 30 ++ apps/server/src/becca/entities/bnote.ts | 3 + .../search/expressions/note_flat_text.ts | 48 ++- .../src/services/search/services/search.ts | 84 ++-- 5 files changed, 318 insertions(+), 227 deletions(-) diff --git a/apps/server/spec/search_profiling.spec.ts b/apps/server/spec/search_profiling.spec.ts index 9f5f848034..8099a322b4 100644 --- a/apps/server/spec/search_profiling.spec.ts +++ b/apps/server/spec/search_profiling.spec.ts @@ -4,8 +4,8 @@ * Uses the real SQLite database (spec/db/document.db loaded in-memory), * real sql module, real becca cache, and the full app stack. * - * Seeds a large number of notes via direct SQL (much faster than ETAPI) - * to create a realistic dataset for profiling. + * Profiles search at large scale (50K+ notes) to match real-world + * performance reports from users with 240K+ notes. */ import { Application } from "express"; import { beforeAll, describe, expect, it } from "vitest"; @@ -58,224 +58,246 @@ describe("Search profiling (integration)", () => { app = await buildApp(); }); - it("seed and profile with realistic data", async () => { + it("large-scale profiling (50K notes)", async () => { const sql = (await import("../src/services/sql.js")).default; const becca = (await import("../src/becca/becca.js")).default; const beccaLoader = (await import("../src/becca/becca_loader.js")).default; const cls = (await import("../src/services/cls.js")).default; const searchService = (await import("../src/services/search/services/search.js")).default; const SearchContext = (await import("../src/services/search/search_context.js")).default; + const beccaService = (await import("../src/becca/becca_service.js")).default; await new Promise((resolve) => { cls.init(() => { const initialNoteCount = Object.keys(becca.notes).length; console.log(`\n Initial becca notes: ${initialNoteCount}`); - const configs = [ - { notes: 2000, words: 500, label: "2K notes × 500 words (~4KB)" }, - { notes: 2000, words: 2000, label: "2K notes × 2000 words (~15KB)" }, - { notes: 5000, words: 500, label: "5K notes × 500 words (~4KB)" }, - { notes: 5000, words: 2000, label: "5K notes × 2000 words (~15KB)" }, - { notes: 10000, words: 1000, label: "10K notes × 1000 words (~8KB)" }, - ]; + // ── Seed 50K notes with hierarchy ── + // Some folders (depth), some with common keyword "test" in title + const TOTAL_NOTES = 50000; + const FOLDER_COUNT = 500; // 500 folders + const NOTES_PER_FOLDER = (TOTAL_NOTES - FOLDER_COUNT) / FOLDER_COUNT; // ~99 notes per folder + const MATCH_FRACTION = 0.10; // 10% match "test" — ~5000 notes + const CONTENT_WORDS = 500; - for (const cfg of configs) { - // Reset DB: delete all seeded notes from prior iteration - sql.execute(`DELETE FROM blobs WHERE blobId LIKE 'seed%'`); - sql.execute(`DELETE FROM notes WHERE noteId LIKE 'seed%'`); - sql.execute(`DELETE FROM branches WHERE branchId LIKE 'seed%'`); + const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); + console.log(` Seeding ${TOTAL_NOTES} notes (${FOLDER_COUNT} folders, ~${NOTES_PER_FOLDER.toFixed(0)} per folder)...`); - const TOTAL_NOTES = cfg.notes; - const MATCH_FRACTION = 0.15; - const CONTENT_WORDS = cfg.words; - const matchCount = Math.floor(TOTAL_NOTES * MATCH_FRACTION); + const [, seedMs] = timed(() => { + sql.transactional(() => { + const folderIds: string[] = []; - const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); + // Create folders under root + for (let f = 0; f < FOLDER_COUNT; f++) { + const noteId = `seed${randomId(8)}`; + const branchId = `seed${randomId(8)}`; + const blobId = `seed${randomId(16)}`; + folderIds.push(noteId); - console.log(`\n ──── ${cfg.label} ────`); - console.log(` Seeding ${TOTAL_NOTES} notes (${matchCount} with keyword)...`); + sql.execute( + `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, + [blobId, `

Folder ${f}

`, now, now] + ); + sql.execute( + `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, + dateCreated, dateModified, utcDateCreated, utcDateModified) + VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, + [noteId, `Folder ${f} ${randomWord(5)}`, blobId, now, now, now, now] + ); + sql.execute( + `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) + VALUES (?, ?, 'root', ?, 0, 0, ?)`, + [branchId, noteId, f * 10, now] + ); + } - const [, seedMs] = timed(() => { - sql.transactional(() => { - for (let i = 0; i < TOTAL_NOTES; i++) { - const isMatch = i < matchCount; + // Create notes under folders + let noteIdx = 0; + for (let f = 0; f < FOLDER_COUNT; f++) { + const parentId = folderIds[f]; + for (let n = 0; n < NOTES_PER_FOLDER; n++) { + const isMatch = noteIdx < TOTAL_NOTES * MATCH_FRACTION; const noteId = `seed${randomId(8)}`; const branchId = `seed${randomId(8)}`; const blobId = `seed${randomId(16)}`; const title = isMatch - ? `Performance Doc ${i} ${randomWord(6)}` - : `General Note ${i} ${randomWord(6)} ${randomWord(5)}`; - const content = generateContent( - CONTENT_WORDS, - isMatch ? "performance" : undefined - ); + ? `Test Document ${noteIdx} ${randomWord(6)}` + : `Note ${noteIdx} ${randomWord(6)} ${randomWord(5)}`; + const content = generateContent(CONTENT_WORDS, isMatch ? "test" : undefined); sql.execute( - `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) - VALUES (?, ?, ?, ?)`, + `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, [blobId, content, now, now] ); - sql.execute( `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, dateCreated, dateModified, utcDateCreated, utcDateModified) VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, [noteId, title, blobId, now, now, now, now] ); - sql.execute( - `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, - utcDateModified) - VALUES (?, ?, 'root', ?, 0, 0, ?)`, - [branchId, noteId, i * 10, now] + `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) + VALUES (?, ?, ?, ?, 0, 0, ?)`, + [branchId, noteId, parentId, n * 10, now] ); + noteIdx++; } - }); - }); - console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); - - // Reload becca to pick up new notes - const [, reloadMs] = timed(() => { - beccaLoader.load(); - }); - console.log(` Becca reload: ${reloadMs.toFixed(0)}ms`); - console.log(` Becca notes after seed: ${Object.keys(becca.notes).length}`); - - // Verify content is accessible - const sampleNote = Object.values(becca.notes).find(n => n.title.startsWith("Performance Doc")); - if (sampleNote) { - const content = sampleNote.getContent(); - console.log(` Sample content length: ${typeof content === 'string' ? content.length : 0} chars`); - } - - // ========================================== - // PROFILING - // ========================================== - - console.log(`\n --- PROFILING (${cfg.label}) ---\n`); - - // --- 1. Fast search (NoteFlatTextExp only) --- - searchService.findResultsWithQuery("performance", new SearchContext({ fastSearch: true })); - - const fastTimes: number[] = []; - let fastResultCount = 0; - for (let i = 0; i < 5; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: true }) - ) - ); - fastTimes.push(ms); - fastResultCount = r.length; - } - const fastAvg = fastTimes.reduce((a, b) => a + b, 0) / fastTimes.length; - console.log(` Fast search (flat text only): avg ${fastAvg.toFixed(1)}ms (${fastResultCount} results)`); - - // --- 2. Full search (flat text + content fulltext via SQL) --- - const fullTimes: number[] = []; - let fullResultCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: false }) - ) - ); - fullTimes.push(ms); - fullResultCount = r.length; - } - const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; - console.log(` Full search (flat + SQL content): avg ${fullAvg.toFixed(1)}ms (${fullResultCount} results)`); - - // --- 3. Content snippet extraction --- - const fastResults = searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: true })); - const trimmed = fastResults.slice(0, 200); - const tokens = ["performance"]; - - const snippetTimes: number[] = []; - for (let i = 0; i < 3; i++) { - const [, ms] = timed(() => { - for (const r of trimmed) { - r.contentSnippet = searchService.extractContentSnippet(r.noteId, tokens); - } - }); - snippetTimes.push(ms); - } - const snippetAvg = snippetTimes.reduce((a, b) => a + b, 0) / snippetTimes.length; - console.log(` Content snippet (${trimmed.length} results): avg ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); - - // --- 4. Raw getContent() cost --- - const contentTimes: number[] = []; - const textNotes = trimmed - .map(r => becca.notes[r.noteId]) - .filter(n => n && ["text", "code"].includes(n.type)); - - for (let i = 0; i < 5; i++) { - const [, ms] = timed(() => { - for (const n of textNotes) n.getContent(); - }); - contentTimes.push(ms); - } - const contentAvg = contentTimes.reduce((a, b) => a + b, 0) / contentTimes.length; - console.log(` getContent() × ${textNotes.length} notes: avg ${contentAvg.toFixed(1)}ms (${(contentAvg / textNotes.length).toFixed(3)}ms/note)`); - - // --- 5. striptags + normalize cost (isolated) --- - const striptags = require("striptags"); - const normalizeString = require("normalize-strings"); - const contents = textNotes.map(n => n.getContent() as string).filter(Boolean); - - const [, stripMs] = timed(() => { - for (const c of contents) { - striptags(c); } }); - console.log(` striptags × ${contents.length} notes: ${stripMs.toFixed(1)}ms (${(stripMs / contents.length).toFixed(3)}ms/note)`); + }); + console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); - const stripped = contents.map(c => striptags(c)); - const [, normMs] = timed(() => { - for (const s of stripped) { - normalizeString(s.toLowerCase()); - } - }); - console.log(` normalizeString × ${stripped.length} notes: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); + const [, reloadMs] = timed(() => beccaLoader.load()); + const totalNotes = Object.keys(becca.notes).length; + console.log(` Becca reload: ${reloadMs.toFixed(0)}ms Total notes: ${totalNotes}`); - // --- 6. Full autocomplete --- - const autoTimes: number[] = []; - let autoResultCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.searchNotesForAutocomplete("performance", true) - ); - autoTimes.push(ms); - autoResultCount = r.length; + // ── Warm caches ── + searchService.searchNotesForAutocomplete("test", true); + + // ════════════════════════════════════════════ + // PROFILING AT SCALE + // ════════════════════════════════════════════ + + console.log(`\n ════ PROFILING (${totalNotes} notes) ════\n`); + + // 1. getCandidateNotes cost (the full-scan bottleneck) + const allNotes = Object.values(becca.notes); + const [, flatScanMs] = timed(() => { + let count = 0; + for (const note of allNotes) { + const ft = note.getFlatText(); + if (ft.includes("test")) count++; } - const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; - console.log(`\n FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms (${autoResultCount} results)`); + return count; + }); + console.log(` getFlatText + includes scan (${allNotes.length} notes): ${flatScanMs.toFixed(1)}ms`); - // --- 7. SQL content scan cost --- - const [scanCount, scanMs] = timed(() => { - let count = 0; - for (const row of sql.iterateRows<{ content: Buffer | string }>(` - SELECT noteId, type, mime, content, isProtected - FROM notes JOIN blobs USING (blobId) - WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') - AND isDeleted = 0 - AND LENGTH(content) < 2097152`)) { - count++; - } - return count; - }); - console.log(` SQL content scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); - - // --- Summary --- - console.log(`\n === SUMMARY (${cfg.label}, ${Object.keys(becca.notes).length} total notes) ===`); - console.log(` Fast search: ${fastAvg.toFixed(1)}ms`); - console.log(` Full search: ${fullAvg.toFixed(1)}ms`); - console.log(` Content snippets: ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); - console.log(` normalizeString: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); - console.log(` Full autocomplete: ${autoAvg.toFixed(1)}ms`); - console.log(` SQL scan: ${scanMs.toFixed(1)}ms (${scanCount} rows)`); + // 2. Full findResultsWithQuery (includes candidate scan + parent walk + scoring) + const findTimes: number[] = []; + let findResultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })) + ); + findTimes.push(ms); + findResultCount = r.length; } + const findAvg = findTimes.reduce((a, b) => a + b, 0) / findTimes.length; + console.log(` findResultsWithQuery (fast): avg ${findAvg.toFixed(1)}ms (${findResultCount} results)`); + + // 3. Exact-only (no fuzzy) + const exactTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true, enableFuzzyMatching: false })) + ); + exactTimes.push(ms); + } + const exactAvg = exactTimes.reduce((a, b) => a + b, 0) / exactTimes.length; + console.log(` findResultsWithQuery (exact): avg ${exactAvg.toFixed(1)}ms`); + console.log(` Fuzzy overhead: ${(findAvg - exactAvg).toFixed(1)}ms`); + + // 4. SearchResult construction + computeScore cost (isolated) + const results = searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })); + console.log(` Total results before trim: ${results.length}`); + + const [, scoreAllMs] = timed(() => { + for (const r of results) r.computeScore("test", ["test"], true); + }); + console.log(` computeScore × ${results.length}: ${scoreAllMs.toFixed(1)}ms (${(scoreAllMs / results.length).toFixed(3)}ms/result)`); + + // 5. getNoteTitleForPath for all results + const [, pathTitleMs] = timed(() => { + for (const r of results) beccaService.getNoteTitleForPath(r.notePathArray); + }); + console.log(` getNoteTitleForPath × ${results.length}: ${pathTitleMs.toFixed(1)}ms`); + + // 6. Content snippet extraction (only 200) + const trimmed = results.slice(0, 200); + const [, snippetMs] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet(r.noteId, ["test"]); + } + }); + console.log(` extractContentSnippet × 200: ${snippetMs.toFixed(1)}ms`); + + // 7. Highlighting (only 200) + const [, hlMs] = timed(() => { + searchService.highlightSearchResults(trimmed, ["test"]); + }); + console.log(` highlightSearchResults × 200: ${hlMs.toFixed(1)}ms`); + + // 7b. getBestNotePath cost (used by fast path) + const sampleNotes = Object.values(becca.notes).filter(n => n.title.startsWith("Test Document")).slice(0, 1000); + const [, bestPathMs] = timed(() => { + for (const n of sampleNotes) n.getBestNotePath(); + }); + console.log(` getBestNotePath × ${sampleNotes.length}: ${bestPathMs.toFixed(1)}ms (${(bestPathMs/sampleNotes.length).toFixed(3)}ms/note)`); + + // 8. Full autocomplete end-to-end + const autoTimes: number[] = []; + let autoCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.searchNotesForAutocomplete("test", true) + ); + autoTimes.push(ms); + autoCount = r.length; + } + const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; + const autoMin = Math.min(...autoTimes); + console.log(`\n ★ FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms min ${autoMin.toFixed(1)}ms (${autoCount} results)`); + + // 9. With a less common search term (fewer matches) + const rareTimes: number[] = []; + let rareCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.searchNotesForAutocomplete("leitfaden", true) + ); + rareTimes.push(ms); + rareCount = r.length; + } + const rareAvg = rareTimes.reduce((a, b) => a + b, 0) / rareTimes.length; + console.log(` Autocomplete "leitfaden": avg ${rareAvg.toFixed(1)}ms (${rareCount} results)`); + + // 10. Full search (fastSearch=false) — the 2.7s bottleneck + console.log(`\n ── Full search (fastSearch=false) ──`); + const fullTimes: number[] = []; + let fullCount = 0; + for (let i = 0; i < 2; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: false })) + ); + fullTimes.push(ms); + fullCount = r.length; + } + const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; + console.log(` Full search (flat + SQL): avg ${fullAvg.toFixed(1)}ms (${fullCount} results)`); + + // 11. SQL content scan alone + const [scanCount, scanMs] = timed(() => { + let count = 0; + for (const row of sql.iterateRows<{ content: Buffer | string }>(` + SELECT noteId, type, mime, content, isProtected + FROM notes JOIN blobs USING (blobId) + WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') + AND isDeleted = 0 + AND LENGTH(content) < 2097152`)) { + count++; + } + return count; + }); + console.log(` Raw SQL scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); + + // ── Summary ── + console.log(`\n ════ SUMMARY ════`); + console.log(` Notes: ${totalNotes} | Matches: ${findResultCount} | Hierarchy depth: 3 (root → folder → note)`); + console.log(` ──────────────────────────────────`); + console.log(` Autocomplete (fast): ${autoAvg.toFixed(1)}ms`); + console.log(` findResults: ${findAvg.toFixed(1)}ms (${((findAvg/autoAvg)*100).toFixed(0)}%)`); + console.log(` snippets+highlight: ${(snippetMs + hlMs).toFixed(1)}ms (${(((snippetMs+hlMs)/autoAvg)*100).toFixed(0)}%)`); + console.log(` Full search: ${fullAvg.toFixed(1)}ms`); resolve(); }); diff --git a/apps/server/src/becca/becca-interface.ts b/apps/server/src/becca/becca-interface.ts index 1a8203f436..6619ed30b9 100644 --- a/apps/server/src/becca/becca-interface.ts +++ b/apps/server/src/becca/becca-interface.ts @@ -31,9 +31,17 @@ export default class Becca { allNoteSetCache: NoteSet | null; + /** + * Pre-built parallel arrays for fast flat text scanning in search. + * Avoids per-note property access overhead when iterating 50K+ notes. + * Dirtied when notes change (along with allNoteSetCache). + */ + flatTextIndex: { notes: BNote[], flatTexts: string[] } | null; + constructor() { this.reset(); this.allNoteSetCache = null; + this.flatTextIndex = null; } reset() { @@ -239,6 +247,28 @@ export default class Becca { /** Should be called when the set of all non-skeleton notes changes (added/removed) */ dirtyNoteSetCache() { this.allNoteSetCache = null; + this.flatTextIndex = null; + } + + /** + * Returns pre-built parallel arrays of notes and their flat texts for fast scanning. + * The flat texts are already normalized (lowercase, diacritics removed). + */ + getFlatTextIndex(): { notes: BNote[], flatTexts: string[] } { + if (!this.flatTextIndex) { + const allNoteSet = this.getAllNoteSet(); + const notes: BNote[] = []; + const flatTexts: string[] = []; + + for (const note of allNoteSet.notes) { + notes.push(note); + flatTexts.push(note.getFlatText()); + } + + this.flatTextIndex = { notes, flatTexts }; + } + + return this.flatTextIndex; } getAllNoteSet() { diff --git a/apps/server/src/becca/entities/bnote.ts b/apps/server/src/becca/entities/bnote.ts index 112543a603..4e78974b4e 100644 --- a/apps/server/src/becca/entities/bnote.ts +++ b/apps/server/src/becca/entities/bnote.ts @@ -790,6 +790,9 @@ class BNote extends AbstractBeccaEntity { this.__attributeCache = null; this.__inheritableAttributeCache = null; this.__ancestorCache = null; + + // Dirty the becca-level flat text index since this note's flat text may have changed + this.becca.flatTextIndex = null; } invalidateSubTree(path: string[] = []) { diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index b9ad19c36c..93213d164e 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -99,6 +99,22 @@ class NoteFlatTextExp extends Expression { const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext); + // Fast path for single-token searches with a limit (e.g. autocomplete): + // Skip the expensive recursive parent walk and just use getBestNotePath(). + // The flat text already matched, so we know the token is present. + if (this.tokens.length === 1 && searchContext.limit) { + for (const note of candidateNotes) { + if (!resultNoteSet.hasNoteId(note.noteId)) { + const notePath = note.getBestNotePath(); + if (notePath) { + executionContext.noteIdToNotePath[note.noteId] = notePath; + resultNoteSet.add(note); + } + } + } + return resultNoteSet; + } + for (const note of candidateNotes) { // autocomplete should be able to find notes by their noteIds as well (only leafs) if (this.tokens.length === 1 && note.noteId.toLowerCase() === this.tokens[0]) { @@ -112,7 +128,7 @@ class NoteFlatTextExp extends Expression { // Add defensive checks for undefined properties const typeMatches = note.type && note.type.includes(token); const mimeMatches = note.mime && note.mime.includes(token); - + if (typeMatches || mimeMatches) { foundAttrTokens.push(token); } @@ -165,14 +181,38 @@ class NoteFlatTextExp extends Expression { getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] { const candidateNotes: BNote[] = []; - for (const note of noteSet.notes) { - const normalizedFlatText = normalizeSearchText(note.getFlatText()); + // For limited searches (e.g. autocomplete), cap candidates to avoid + // processing thousands of matches when only a few hundred are needed. + // Use 5x the limit to ensure enough quality candidates for scoring. + const maxCandidates = searchContext?.limit ? searchContext.limit * 5 : Infinity; + + // Use the pre-built flat text index for fast scanning. + // This provides pre-computed flat texts in a parallel array, avoiding + // per-note property access overhead at large scale (50K+ notes). + const { notes: indexNotes, flatTexts } = becca.getFlatTextIndex(); + + // Build a set for quick membership check when noteSet isn't the full set + const isFullSet = noteSet.notes.length === indexNotes.length; + + for (let i = 0; i < indexNotes.length; i++) { + const note = indexNotes[i]; + + // Skip notes not in the input set (only check when not using the full set) + if (!isFullSet && !noteSet.hasNoteId(note.noteId)) { + continue; + } + + const flatText = flatTexts[i]; for (const token of this.tokens) { - if (this.smartMatch(normalizedFlatText, token, searchContext)) { + if (this.smartMatch(flatText, token, searchContext)) { candidateNotes.push(note); break; } } + + if (candidateNotes.length >= maxCandidates) { + break; + } } return candidateNotes; diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 4701964f5b..7ee3e494f4 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -16,7 +16,6 @@ import type { SearchParams, TokenStructure } from "./types.js"; import type Expression from "../expressions/expression.js"; import sql from "../../sql.js"; import scriptService from "../../script.js"; -import striptags from "striptags"; import protectedSessionService from "../../protected_session.js"; export interface SearchNoteResult { @@ -249,23 +248,30 @@ function findResultsWithExpression(expression: Expression, searchContext: Search return performSearch(expression, searchContext, false); } + // For limited searches (e.g. autocomplete), skip the expensive two-phase + // fuzzy fallback. The user is typing and will refine their query — exact + // matching is sufficient and avoids a second full scan of all notes. + if (searchContext.limit) { + return performSearch(expression, searchContext, false); + } + // Phase 1: Try exact matches first (without fuzzy matching) const exactResults = performSearch(expression, searchContext, false); - + // Check if we have sufficient high-quality results const minResultThreshold = 5; const minScoreForQuality = 10; // Minimum score to consider a result "high quality" - + const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality); - + // If we have enough high-quality exact matches, return them if (highQualityResults.length >= minResultThreshold) { return exactResults; } - + // Phase 2: Add fuzzy matching as fallback when exact matches are insufficient const fuzzyResults = performSearch(expression, searchContext, true); - + // Merge results, ensuring exact matches always rank higher than fuzzy matches return mergeExactAndFuzzyResults(exactResults, fuzzyResults); } @@ -447,7 +453,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength try { let content = note.getContent(); - + if (!content || typeof content !== "string") { return ""; } @@ -463,77 +469,66 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength return ""; // Protected but no session available } - // Strip HTML tags for text notes + // Strip HTML tags for text notes — use fast regex for snippet extraction + // (striptags library is ~18x slower and not needed for search snippets) if (note.type === "text") { - content = striptags(content); + content = content.replace(/<[^>]*>/g, ""); } - // Normalize whitespace while preserving paragraph breaks - // First, normalize multiple newlines to double newlines (paragraph breaks) - content = content.replace(/\n\s*\n/g, "\n\n"); - // Then normalize spaces within lines - content = content.split('\n').map(line => line.replace(/\s+/g, " ").trim()).join('\n'); - // Finally trim the whole content - content = content.trim(); - if (!content) { return ""; } - // Try to find a snippet around the first matching token + // Find match position using normalize on the raw stripped content. + // We use a single normalize() pass — no need for expensive whitespace + // normalization just to find the match index. const normalizedContent = normalize(content); + const normalizedTokens = searchTokens.map(token => normalize(token)); let snippetStart = 0; - let matchFound = false; - for (const token of searchTokens) { - const normalizedToken = normalize(token); + for (const normalizedToken of normalizedTokens) { const matchIndex = normalizedContent.indexOf(normalizedToken); - + if (matchIndex !== -1) { // Center the snippet around the match snippetStart = Math.max(0, matchIndex - maxLength / 2); - matchFound = true; break; } } - // Extract snippet - let snippet = content.substring(snippetStart, snippetStart + maxLength); + // Extract a snippet region from the raw content, then clean only that + const snippetRegion = content.substring(snippetStart, snippetStart + maxLength + 100); - // If snippet contains linebreaks, limit to max 4 lines and override character limit + // Normalize whitespace only on the small snippet region + let snippet = snippetRegion + .replace(/\n\s*\n/g, "\n\n") + .replace(/[ \t]+/g, " ") + .trim() + .substring(0, maxLength); + + // If snippet contains linebreaks, limit to max 4 lines const lines = snippet.split('\n'); if (lines.length > 4) { - // Find which lines contain the search tokens to ensure they're included - const normalizedLines = lines.map(line => normalize(line)); - const normalizedTokens = searchTokens.map(token => normalize(token)); - // Find the first line that contains a search token let firstMatchLine = -1; - for (let i = 0; i < normalizedLines.length; i++) { - if (normalizedTokens.some(token => normalizedLines[i].includes(token))) { + for (let i = 0; i < lines.length; i++) { + const normalizedLine = normalize(lines[i]); + if (normalizedTokens.some(token => normalizedLine.includes(token))) { firstMatchLine = i; break; } } if (firstMatchLine !== -1) { - // Center the 4-line window around the first match - // Try to show 1 line before and 2 lines after the match const startLine = Math.max(0, firstMatchLine - 1); const endLine = Math.min(lines.length, startLine + 4); snippet = lines.slice(startLine, endLine).join('\n'); } else { - // No match found in lines (shouldn't happen), just take first 4 snippet = lines.slice(0, 4).join('\n'); } - // Add ellipsis if we truncated lines snippet = snippet + "..."; - } else if (lines.length > 1) { - // For multi-line snippets that are 4 or fewer lines, keep them as-is - // No need to truncate - } else { - // Single line content - apply original word boundary logic - // Try to start/end at word boundaries + } else if (lines.length <= 1) { + // Single line content - apply word boundary logic if (snippetStart > 0) { const firstSpace = snippet.search(/\s/); if (firstSpace > 0 && firstSpace < 20) { @@ -541,7 +536,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength } snippet = "..." + snippet; } - + if (snippetStart + maxLength < content.length) { const lastSpace = snippet.search(/\s[^\s]*$/); if (lastSpace > snippet.length - 20 && lastSpace > 0) { @@ -649,7 +644,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) { includeHiddenNotes: true, fuzzyAttributeSearch: true, ignoreInternalAttributes: true, - ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId() + ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId(), + limit: 200 }); const allSearchResults = findResultsWithQuery(query, searchContext);