From 585b6ccd3e3dbb4f9941824e566829aa882884b7 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Wed, 11 Mar 2026 19:05:44 -0700 Subject: [PATCH 01/33] feat(search): try to improve performance --- apps/server/spec/search_profiling.spec.ts | 284 ++++++++++ .../src/services/search/services/search.ts | 19 +- .../search/services/search_profiling.spec.ts | 526 ++++++++++++++++++ 3 files changed, 819 insertions(+), 10 deletions(-) create mode 100644 apps/server/spec/search_profiling.spec.ts create mode 100644 apps/server/src/services/search/services/search_profiling.spec.ts diff --git a/apps/server/spec/search_profiling.spec.ts b/apps/server/spec/search_profiling.spec.ts new file mode 100644 index 0000000000..9f5f848034 --- /dev/null +++ b/apps/server/spec/search_profiling.spec.ts @@ -0,0 +1,284 @@ +/** + * Integration-level search profiling test. + * + * Uses the real SQLite database (spec/db/document.db loaded in-memory), + * real sql module, real becca cache, and the full app stack. + * + * Seeds a large number of notes via direct SQL (much faster than ETAPI) + * to create a realistic dataset for profiling. 
+ */ +import { Application } from "express"; +import { beforeAll, describe, expect, it } from "vitest"; +import config from "../src/services/config.js"; + +let app: Application; + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +function randomId(len = 12): string { + const chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + let id = ""; + for (let i = 0; i < len; i++) id += chars[Math.floor(Math.random() * chars.length)]; + return id; +} + +function randomWord(len = 8): string { + const chars = "abcdefghijklmnopqrstuvwxyz"; + let w = ""; + for (let i = 0; i < len; i++) w += chars[Math.floor(Math.random() * chars.length)]; + return w; +} + +function generateContent(wordCount: number, keyword?: string): string { + const paragraphs: string[] = []; + let remaining = wordCount; + let injected = false; + while (remaining > 0) { + const n = Math.min(remaining, 30 + Math.floor(Math.random() * 30)); + const words: string[] = []; + for (let i = 0; i < n; i++) words.push(randomWord(3 + Math.floor(Math.random() * 10))); + if (keyword && !injected && remaining < wordCount / 2) { + words[Math.floor(words.length / 2)] = keyword; + injected = true; + } + paragraphs.push(`

${words.join(" ")}

`); + remaining -= n; + } + return paragraphs.join("\n"); +} + +describe("Search profiling (integration)", () => { + beforeAll(async () => { + config.General.noAuthentication = true; + const buildApp = (await import("../src/app.js")).default; + app = await buildApp(); + }); + + it("seed and profile with realistic data", async () => { + const sql = (await import("../src/services/sql.js")).default; + const becca = (await import("../src/becca/becca.js")).default; + const beccaLoader = (await import("../src/becca/becca_loader.js")).default; + const cls = (await import("../src/services/cls.js")).default; + const searchService = (await import("../src/services/search/services/search.js")).default; + const SearchContext = (await import("../src/services/search/search_context.js")).default; + + await new Promise((resolve) => { + cls.init(() => { + const initialNoteCount = Object.keys(becca.notes).length; + console.log(`\n Initial becca notes: ${initialNoteCount}`); + + const configs = [ + { notes: 2000, words: 500, label: "2K notes × 500 words (~4KB)" }, + { notes: 2000, words: 2000, label: "2K notes × 2000 words (~15KB)" }, + { notes: 5000, words: 500, label: "5K notes × 500 words (~4KB)" }, + { notes: 5000, words: 2000, label: "5K notes × 2000 words (~15KB)" }, + { notes: 10000, words: 1000, label: "10K notes × 1000 words (~8KB)" }, + ]; + + for (const cfg of configs) { + // Reset DB: delete all seeded notes from prior iteration + sql.execute(`DELETE FROM blobs WHERE blobId LIKE 'seed%'`); + sql.execute(`DELETE FROM notes WHERE noteId LIKE 'seed%'`); + sql.execute(`DELETE FROM branches WHERE branchId LIKE 'seed%'`); + + const TOTAL_NOTES = cfg.notes; + const MATCH_FRACTION = 0.15; + const CONTENT_WORDS = cfg.words; + const matchCount = Math.floor(TOTAL_NOTES * MATCH_FRACTION); + + const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); + + console.log(`\n ──── ${cfg.label} ────`); + console.log(` Seeding ${TOTAL_NOTES} notes (${matchCount} with 
keyword)...`); + + const [, seedMs] = timed(() => { + sql.transactional(() => { + for (let i = 0; i < TOTAL_NOTES; i++) { + const isMatch = i < matchCount; + const noteId = `seed${randomId(8)}`; + const branchId = `seed${randomId(8)}`; + const blobId = `seed${randomId(16)}`; + const title = isMatch + ? `Performance Doc ${i} ${randomWord(6)}` + : `General Note ${i} ${randomWord(6)} ${randomWord(5)}`; + const content = generateContent( + CONTENT_WORDS, + isMatch ? "performance" : undefined + ); + + sql.execute( + `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) + VALUES (?, ?, ?, ?)`, + [blobId, content, now, now] + ); + + sql.execute( + `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, + dateCreated, dateModified, utcDateCreated, utcDateModified) + VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, + [noteId, title, blobId, now, now, now, now] + ); + + sql.execute( + `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, + utcDateModified) + VALUES (?, ?, 'root', ?, 0, 0, ?)`, + [branchId, noteId, i * 10, now] + ); + } + }); + }); + console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); + + // Reload becca to pick up new notes + const [, reloadMs] = timed(() => { + beccaLoader.load(); + }); + console.log(` Becca reload: ${reloadMs.toFixed(0)}ms`); + console.log(` Becca notes after seed: ${Object.keys(becca.notes).length}`); + + // Verify content is accessible + const sampleNote = Object.values(becca.notes).find(n => n.title.startsWith("Performance Doc")); + if (sampleNote) { + const content = sampleNote.getContent(); + console.log(` Sample content length: ${typeof content === 'string' ? content.length : 0} chars`); + } + + // ========================================== + // PROFILING + // ========================================== + + console.log(`\n --- PROFILING (${cfg.label}) ---\n`); + + // --- 1. 
Fast search (NoteFlatTextExp only) --- + searchService.findResultsWithQuery("performance", new SearchContext({ fastSearch: true })); + + const fastTimes: number[] = []; + let fastResultCount = 0; + for (let i = 0; i < 5; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("performance", + new SearchContext({ fastSearch: true }) + ) + ); + fastTimes.push(ms); + fastResultCount = r.length; + } + const fastAvg = fastTimes.reduce((a, b) => a + b, 0) / fastTimes.length; + console.log(` Fast search (flat text only): avg ${fastAvg.toFixed(1)}ms (${fastResultCount} results)`); + + // --- 2. Full search (flat text + content fulltext via SQL) --- + const fullTimes: number[] = []; + let fullResultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("performance", + new SearchContext({ fastSearch: false }) + ) + ); + fullTimes.push(ms); + fullResultCount = r.length; + } + const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; + console.log(` Full search (flat + SQL content): avg ${fullAvg.toFixed(1)}ms (${fullResultCount} results)`); + + // --- 3. Content snippet extraction --- + const fastResults = searchService.findResultsWithQuery("performance", + new SearchContext({ fastSearch: true })); + const trimmed = fastResults.slice(0, 200); + const tokens = ["performance"]; + + const snippetTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet(r.noteId, tokens); + } + }); + snippetTimes.push(ms); + } + const snippetAvg = snippetTimes.reduce((a, b) => a + b, 0) / snippetTimes.length; + console.log(` Content snippet (${trimmed.length} results): avg ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); + + // --- 4. 
Raw getContent() cost --- + const contentTimes: number[] = []; + const textNotes = trimmed + .map(r => becca.notes[r.noteId]) + .filter(n => n && ["text", "code"].includes(n.type)); + + for (let i = 0; i < 5; i++) { + const [, ms] = timed(() => { + for (const n of textNotes) n.getContent(); + }); + contentTimes.push(ms); + } + const contentAvg = contentTimes.reduce((a, b) => a + b, 0) / contentTimes.length; + console.log(` getContent() × ${textNotes.length} notes: avg ${contentAvg.toFixed(1)}ms (${(contentAvg / textNotes.length).toFixed(3)}ms/note)`); + + // --- 5. striptags + normalize cost (isolated) --- + const striptags = require("striptags"); + const normalizeString = require("normalize-strings"); + const contents = textNotes.map(n => n.getContent() as string).filter(Boolean); + + const [, stripMs] = timed(() => { + for (const c of contents) { + striptags(c); + } + }); + console.log(` striptags × ${contents.length} notes: ${stripMs.toFixed(1)}ms (${(stripMs / contents.length).toFixed(3)}ms/note)`); + + const stripped = contents.map(c => striptags(c)); + const [, normMs] = timed(() => { + for (const s of stripped) { + normalizeString(s.toLowerCase()); + } + }); + console.log(` normalizeString × ${stripped.length} notes: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); + + // --- 6. Full autocomplete --- + const autoTimes: number[] = []; + let autoResultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.searchNotesForAutocomplete("performance", true) + ); + autoTimes.push(ms); + autoResultCount = r.length; + } + const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; + console.log(`\n FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms (${autoResultCount} results)`); + + // --- 7. 
SQL content scan cost --- + const [scanCount, scanMs] = timed(() => { + let count = 0; + for (const row of sql.iterateRows<{ content: Buffer | string }>(` + SELECT noteId, type, mime, content, isProtected + FROM notes JOIN blobs USING (blobId) + WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') + AND isDeleted = 0 + AND LENGTH(content) < 2097152`)) { + count++; + } + return count; + }); + console.log(` SQL content scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); + + // --- Summary --- + console.log(`\n === SUMMARY (${cfg.label}, ${Object.keys(becca.notes).length} total notes) ===`); + console.log(` Fast search: ${fastAvg.toFixed(1)}ms`); + console.log(` Full search: ${fullAvg.toFixed(1)}ms`); + console.log(` Content snippets: ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); + console.log(` normalizeString: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); + console.log(` Full autocomplete: ${autoAvg.toFixed(1)}ms`); + console.log(` SQL scan: ${scanMs.toFixed(1)}ms (${scanCount} rows)`); + } + + resolve(); + }); + }); + }, 600_000); +}); diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 5ca4bda4a1..4701964f5b 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -1,6 +1,5 @@ "use strict"; -import normalizeString from "normalize-strings"; import lex from "./lex.js"; import handleParens from "./handle_parens.js"; import parse from "./parse.js"; @@ -8,7 +7,7 @@ import SearchResult from "../search_result.js"; import SearchContext from "../search_context.js"; import becca from "../../../becca/becca.js"; import beccaService from "../../../becca/becca_service.js"; -import { normalize, escapeHtml, escapeRegExp } from "../../utils.js"; +import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js"; import log from "../../log.js"; import 
hoistedNoteService from "../../hoisted_note.js"; import type BNote from "../../../becca/entities/bnote.js"; @@ -482,12 +481,12 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength } // Try to find a snippet around the first matching token - const normalizedContent = normalizeString(content.toLowerCase()); + const normalizedContent = normalize(content); let snippetStart = 0; let matchFound = false; for (const token of searchTokens) { - const normalizedToken = normalizeString(token.toLowerCase()); + const normalizedToken = normalize(token); const matchIndex = normalizedContent.indexOf(normalizedToken); if (matchIndex !== -1) { @@ -505,8 +504,8 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength const lines = snippet.split('\n'); if (lines.length > 4) { // Find which lines contain the search tokens to ensure they're included - const normalizedLines = lines.map(line => normalizeString(line.toLowerCase())); - const normalizedTokens = searchTokens.map(token => normalizeString(token.toLowerCase())); + const normalizedLines = lines.map(line => normalize(line)); + const normalizedTokens = searchTokens.map(token => normalize(token)); // Find the first line that contains a search token let firstMatchLine = -1; @@ -582,7 +581,7 @@ function extractAttributeSnippet(noteId: string, searchTokens: string[], maxLeng // Check if any search token matches the attribute name or value const hasMatch = searchTokens.some(token => { - const normalizedToken = normalizeString(token.toLowerCase()); + const normalizedToken = normalize(token); return attrName.includes(normalizedToken) || attrValue.includes(normalizedToken); }); @@ -734,7 +733,7 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens // Highlight in note path title if (result.highlightedNotePathTitle) { const titleRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = 
titleRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) { + while ((match = titleRegex.exec(removeDiacritic(result.highlightedNotePathTitle))) !== null) { result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}"); // 2 characters are added, so we need to adjust the index titleRegex.lastIndex += 2; @@ -744,7 +743,7 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens // Highlight in content snippet if (result.highlightedContentSnippet) { const contentRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = contentRegex.exec(normalizeString(result.highlightedContentSnippet))) !== null) { + while ((match = contentRegex.exec(removeDiacritic(result.highlightedContentSnippet))) !== null) { result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}"); // 2 characters are added, so we need to adjust the index contentRegex.lastIndex += 2; @@ -754,7 +753,7 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens // Highlight in attribute snippet if (result.highlightedAttributeSnippet) { const attributeRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = attributeRegex.exec(normalizeString(result.highlightedAttributeSnippet))) !== null) { + while ((match = attributeRegex.exec(removeDiacritic(result.highlightedAttributeSnippet))) !== null) { result.highlightedAttributeSnippet = wrapText(result.highlightedAttributeSnippet, match.index, token.length, "{", "}"); // 2 characters are added, so we need to adjust the index attributeRegex.lastIndex += 2; diff --git a/apps/server/src/services/search/services/search_profiling.spec.ts b/apps/server/src/services/search/services/search_profiling.spec.ts new file mode 100644 index 0000000000..96a414b257 --- /dev/null +++ b/apps/server/src/services/search/services/search_profiling.spec.ts @@ -0,0 +1,526 @@ +/** + * Search performance 
profiling tests. + * + * These tests measure where time is spent in the search pipeline. + * We monkeypatch note.getContent() to return synthetic HTML content + * since unit tests don't have a real SQLite database. + * + * KNOWN GAPS vs production: + * - note.getContent() is instant (monkeypatched) vs ~2ms SQL fetch + * - NoteContentFulltextExp.execute() is skipped (no sql.iterateRows) + * because fastSearch=true uses only NoteFlatTextExp + * - These tests focus on the in-memory/CPU-bound parts of the pipeline + */ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import searchService from "./search.js"; +import BNote from "../../../becca/entities/bnote.js"; +import BBranch from "../../../becca/entities/bbranch.js"; +import SearchContext from "../search_context.js"; +import becca from "../../../becca/becca.js"; +import beccaService from "../../../becca/becca_service.js"; +import { NoteBuilder, note, id } from "../../../test/becca_mocking.js"; +import SearchResult from "../search_result.js"; +import { normalizeSearchText } from "../utils/text_utils.js"; + +// ── helpers ────────────────────────────────────────────────────────── + +function randomWord(len = 6): string { + const chars = "abcdefghijklmnopqrstuvwxyz"; + let word = ""; + for (let i = 0; i < len; i++) { + word += chars[Math.floor(Math.random() * chars.length)]; + } + return word; +} + +function generateHtmlContent(wordCount: number, includeTarget = false): string { + const paragraphs: string[] = []; + let wordsRemaining = wordCount; + + while (wordsRemaining > 0) { + const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40)); + const words: string[] = []; + for (let i = 0; i < paraWords; i++) { + words.push(randomWord(3 + Math.floor(Math.random() * 10))); + } + if (includeTarget && paragraphs.length === 2) { + words[Math.floor(words.length / 2)] = "target"; + } + paragraphs.push(`

${words.join(" ")}

`); + wordsRemaining -= paraWords; + } + + return `${paragraphs.join("\n")}`; +} + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +interface TimingEntry { label: string; ms: number; } + +function reportTimings(title: string, timings: TimingEntry[]) { + const total = timings.reduce((s, t) => s + t.ms, 0); + console.log(`\n=== ${title} (total: ${total.toFixed(1)}ms) ===`); + for (const { label, ms } of timings) { + const pct = total > 0 ? ((ms / total) * 100).toFixed(0) : "0"; + const bar = "#".repeat(Math.max(1, Math.round(ms / total * 40))); + console.log(` ${label.padEnd(55)} ${ms.toFixed(1).padStart(8)}ms ${pct.padStart(3)}% ${bar}`); + } +} + +// ── dataset builder ────────────────────────────────────────────────── + +const syntheticContent: Record = {}; + +function buildDataset(noteCount: number, opts: { + matchFraction?: number; + labelsPerNote?: number; + depth?: number; + contentWordCount?: number; +} = {}) { + const { + matchFraction = 0.1, + labelsPerNote = 3, + depth = 3, + contentWordCount = 200, + } = opts; + + becca.reset(); + for (const key of Object.keys(syntheticContent)) { + delete syntheticContent[key]; + } + + const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" })); + new BBranch({ + branchId: "none_root", + noteId: "root", + parentNoteId: "none", + notePosition: 10 + }); + + const containers: NoteBuilder[] = []; + let parent = rootNote; + for (let d = 0; d < depth; d++) { + const container = note(`Container_${d}_${randomWord(4)}`); + parent.child(container); + containers.push(container); + parent = container; + } + + const matchCount = Math.floor(noteCount * matchFraction); + + for (let i = 0; i < noteCount; i++) { + const isMatch = i < matchCount; + const title = isMatch + ? 
`${randomWord(5)} target ${randomWord(5)} Document ${i}` + : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`; + + const n = note(title); + + for (let l = 0; l < labelsPerNote; l++) { + const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`; + const labelValue = isMatch && l === 0 ? "important target" : randomWord(8); + n.label(labelName, labelValue); + } + + syntheticContent[n.note.noteId] = generateHtmlContent(contentWordCount, isMatch); + + const containerIndex = i % containers.length; + containers[containerIndex].child(n); + } + + // Monkeypatch getContent() + for (const noteObj of Object.values(becca.notes)) { + const noteId = noteObj.noteId; + if (syntheticContent[noteId]) { + (noteObj as any).getContent = () => syntheticContent[noteId]; + } else { + (noteObj as any).getContent = () => ""; + } + } + + return { rootNote, matchCount }; +} + +// ── profiling tests ────────────────────────────────────────────────── + +describe("Search Profiling", () => { + + afterEach(() => { + becca.reset(); + }); + + /** + * Break down the autocomplete pipeline into every individual stage, + * including previously unmeasured operations like getBestNotePath, + * SearchResult construction, and getNoteTitleForPath. 
+ */ + describe("Granular autocomplete pipeline", () => { + + for (const noteCount of [500, 2000, 5000, 10000]) { + it(`granular breakdown with ${noteCount} notes`, () => { + const timings: TimingEntry[] = []; + + const [, buildMs] = timed(() => buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 300, + depth: 5 + })); + timings.push({ label: `Dataset build (${noteCount} notes)`, ms: buildMs }); + + // === NoteFlatTextExp: getCandidateNotes === + // This calls getFlatText() + normalizeSearchText() for EVERY note + const allNotes = Object.values(becca.notes); + for (const n of allNotes) n.invalidateThisCache(); + + const [, candidateMs] = timed(() => { + const token = normalizeSearchText("target"); + let count = 0; + for (const n of allNotes) { + const flatText = normalizeSearchText(n.getFlatText()); + if (flatText.includes(token)) count++; + } + return count; + }); + timings.push({ label: `getCandidateNotes simulation (cold caches)`, ms: candidateMs }); + + // Warm cache version + const [candidateCount, candidateWarmMs] = timed(() => { + const token = normalizeSearchText("target"); + let count = 0; + for (const n of allNotes) { + const flatText = normalizeSearchText(n.getFlatText()); + if (flatText.includes(token)) count++; + } + return count; + }); + timings.push({ label: `getCandidateNotes simulation (warm caches)`, ms: candidateWarmMs }); + + // === getBestNotePath for each candidate === + const candidates = allNotes.filter(n => { + const flatText = normalizeSearchText(n.getFlatText()); + return flatText.includes("target"); + }); + + const [, pathMs] = timed(() => { + for (const n of candidates) { + n.getBestNotePath(); + } + }); + timings.push({ label: `getBestNotePath (${candidates.length} notes)`, ms: pathMs }); + + // === SearchResult construction (includes getNoteTitleForPath) === + const paths = candidates.map(n => n.getBestNotePath()).filter(Boolean); + + const [searchResults, srMs] = timed(() => { + return paths.map(p => new 
SearchResult(p)); + }); + timings.push({ label: `SearchResult construction (${paths.length} results)`, ms: srMs }); + + // === computeScore === + const [, scoreMs] = timed(() => { + for (const r of searchResults) { + r.computeScore("target", ["target"], true); + } + }); + timings.push({ label: `computeScore with fuzzy (${searchResults.length} results)`, ms: scoreMs }); + + const [, scoreNoFuzzyMs] = timed(() => { + for (const r of searchResults) { + r.computeScore("target", ["target"], false); + } + }); + timings.push({ label: `computeScore no-fuzzy`, ms: scoreNoFuzzyMs }); + + // === Sorting === + const [, sortMs] = timed(() => { + searchResults.sort((a, b) => { + if (a.score !== b.score) return b.score - a.score; + if (a.notePathArray.length === b.notePathArray.length) { + return a.notePathTitle < b.notePathTitle ? -1 : 1; + } + return a.notePathArray.length - b.notePathArray.length; + }); + }); + timings.push({ label: `Sort results`, ms: sortMs }); + + // === Trim + content snippet extraction === + const trimmed = searchResults.slice(0, 200); + + const [, snippetMs] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + timings.push({ label: `Content snippet extraction (${trimmed.length} results)`, ms: snippetMs }); + + const [, attrMs] = timed(() => { + for (const r of trimmed) { + r.attributeSnippet = searchService.extractAttributeSnippet( + r.noteId, ["target"] + ); + } + }); + timings.push({ label: `Attribute snippet extraction`, ms: attrMs }); + + // === Highlighting === + const [, hlMs] = timed(() => { + searchService.highlightSearchResults(trimmed, ["target"]); + }); + timings.push({ label: `Highlighting`, ms: hlMs }); + + // === Final mapping (getNoteTitleAndIcon) === + const [, mapMs] = timed(() => { + for (const r of trimmed) { + beccaService.getNoteTitleAndIcon(r.noteId); + } + }); + timings.push({ label: `getNoteTitleAndIcon (${trimmed.length} results)`, ms: mapMs 
}); + + // === Full autocomplete for comparison === + const [autoResults, autoMs] = timed(() => { + return searchService.searchNotesForAutocomplete("target", true); + }); + timings.push({ label: `Full autocomplete call (end-to-end)`, ms: autoMs }); + + reportTimings(`Granular Autocomplete — ${noteCount} notes`, timings); + expect(autoResults.length).toBeGreaterThan(0); + }); + } + }); + + /** + * Test the specific cost of normalizeSearchText which is called + * pervasively throughout the pipeline. + */ + describe("normalizeSearchText cost", () => { + + it("profile normalizeSearchText at scale", () => { + buildDataset(5000, { matchFraction: 0.2, contentWordCount: 100 }); + + // Generate various text lengths to profile + const shortTexts = Array.from({ length: 5000 }, () => randomWord(10)); + const mediumTexts = Array.from({ length: 5000 }, () => + Array.from({ length: 20 }, () => randomWord(6)).join(" ") + ); + const longTexts = Object.values(becca.notes).map(n => n.getFlatText()); + + console.log("\n=== normalizeSearchText cost ==="); + + const [, shortMs] = timed(() => { + for (const t of shortTexts) normalizeSearchText(t); + }); + console.log(` 5000 short texts (10 chars): ${shortMs.toFixed(1)}ms (${(shortMs/5000*1000).toFixed(1)}µs/call)`); + + const [, medMs] = timed(() => { + for (const t of mediumTexts) normalizeSearchText(t); + }); + console.log(` 5000 medium texts (120 chars): ${medMs.toFixed(1)}ms (${(medMs/5000*1000).toFixed(1)}µs/call)`); + + const [, longMs] = timed(() => { + for (const t of longTexts) normalizeSearchText(t); + }); + console.log(` ${longTexts.length} flat texts (varying): ${longMs.toFixed(1)}ms (${(longMs/longTexts.length*1000).toFixed(1)}µs/call)`); + }); + }); + + /** + * Test the searchPathTowardsRoot recursive walk which runs + * for every candidate note in NoteFlatTextExp. 
+ */ + describe("searchPathTowardsRoot cost", () => { + + it("profile recursive walk with varying hierarchy depth", () => { + console.log("\n=== Search path walk vs hierarchy depth ==="); + + for (const depth of [3, 5, 8, 12]) { + buildDataset(2000, { + matchFraction: 0.15, + depth, + contentWordCount: 50 + }); + + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` depth=${depth}: ${ms.toFixed(1)}ms (${results.length} results)`); + } + }); + }); + + /** + * Content snippet extraction scaling — the operation that calls + * note.getContent() for each result. + */ + describe("Content snippet extraction", () => { + + it("profile snippet extraction with varying content sizes", () => { + console.log("\n=== Content snippet extraction vs content size ==="); + + for (const wordCount of [50, 200, 500, 1000, 2000, 5000]) { + buildDataset(500, { + matchFraction: 0.5, + contentWordCount: wordCount + }); + + const ctx = new SearchContext({ fastSearch: true }); + const results = searchService.findResultsWithQuery("target", ctx); + const trimmed = results.slice(0, 200); + + const [, ms] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + + const avgContentLen = Object.values(syntheticContent) + .slice(0, 100) + .reduce((s, c) => s + c.length, 0) / 100; + + console.log(` ${String(wordCount).padStart(5)} words/note (avg ${Math.round(avgContentLen)} chars) × ${trimmed.length} results: ${ms.toFixed(1)}ms (${(ms / trimmed.length).toFixed(3)}ms/note)`); + } + }); + + it("profile snippet extraction with varying result counts", () => { + console.log("\n=== Content snippet extraction vs result count ==="); + + buildDataset(2000, { + matchFraction: 0.5, + contentWordCount: 500 + }); + + const ctx = new SearchContext({ fastSearch: true }); + const allResults = 
searchService.findResultsWithQuery("target", ctx); + + for (const count of [5, 10, 20, 50, 100, 200]) { + const subset = allResults.slice(0, count); + + const [, ms] = timed(() => { + for (const r of subset) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + + console.log(` ${String(count).padStart(3)} results: ${ms.toFixed(1)}ms (${(ms / count).toFixed(3)}ms/note)`); + } + }); + }); + + /** + * Two-phase exact/fuzzy search cost. + */ + describe("Two-phase search cost", () => { + + for (const noteCount of [1000, 5000, 10000]) { + it(`exact vs progressive with ${noteCount} notes`, () => { + const timings: TimingEntry[] = []; + + buildDataset(noteCount, { matchFraction: 0.005, contentWordCount: 50 }); + + const [exactR, exactMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + timings.push({ label: `Exact-only (${exactR.length} results)`, ms: exactMs }); + + const [progR, progMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + timings.push({ label: `Progressive exact→fuzzy (${progR.length} results)`, ms: progMs }); + + const overhead = progMs - exactMs; + timings.push({ label: `Fuzzy phase overhead`, ms: Math.max(0, overhead) }); + + reportTimings(`Two-phase — ${noteCount} notes`, timings); + }); + } + }); + + /** + * End-to-end scaling to give the full picture. 
+ */ + describe("End-to-end scaling", () => { + + it("autocomplete at different scales", () => { + console.log("\n=== End-to-end autocomplete scaling ==="); + console.log(" (fastSearch=true, monkeypatched getContent, no real SQL)"); + + for (const noteCount of [100, 500, 1000, 2000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 300, + depth: 4 + }); + + // Warm up + searchService.searchNotesForAutocomplete("target", true); + + const times: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("target", true)); + times.push(ms); + } + + const avg = times.reduce((a, b) => a + b, 0) / times.length; + const min = Math.min(...times); + + console.log( + ` ${String(noteCount).padStart(6)} notes: avg ${avg.toFixed(1)}ms ` + + `min ${min.toFixed(1)}ms` + ); + } + }); + + it("compare fast vs non-fast search", () => { + console.log("\n=== Fast vs non-fast search (no real SQL for content) ==="); + + for (const noteCount of [500, 2000, 5000]) { + buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 200, + depth: 4 + }); + + const [, fastMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + + // Non-fast search tries NoteContentFulltextExp which uses sql.iterateRows + // This will likely fail/return empty since there's no real DB, but we + // can still measure the overhead of attempting it + let nonFastMs: number; + let nonFastCount: number; + try { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: false }); + return searchService.findResultsWithQuery("target", ctx); + }); + nonFastMs = ms; + nonFastCount = results.length; + } catch { + nonFastMs = -1; + nonFastCount = -1; + } + + console.log( + ` ${String(noteCount).padStart(5)} notes: fast=${fastMs.toFixed(1)}ms ` + + `non-fast=${nonFastMs >= 0 ? 
nonFastMs.toFixed(1) + 'ms' : 'FAILED (no real DB)'} ` + + `(non-fast results: ${nonFastCount})` + ); + } + }); + }); +}); From 77733ce2050953eeb927939becfcbdf97882a2eb Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Wed, 11 Mar 2026 21:11:55 -0700 Subject: [PATCH 02/33] feat(search): try to rice performance some more --- apps/server/spec/search_profiling.spec.ts | 380 +++++++++--------- apps/server/src/becca/becca-interface.ts | 30 ++ apps/server/src/becca/entities/bnote.ts | 3 + .../search/expressions/note_flat_text.ts | 48 ++- .../src/services/search/services/search.ts | 84 ++-- 5 files changed, 318 insertions(+), 227 deletions(-) diff --git a/apps/server/spec/search_profiling.spec.ts b/apps/server/spec/search_profiling.spec.ts index 9f5f848034..8099a322b4 100644 --- a/apps/server/spec/search_profiling.spec.ts +++ b/apps/server/spec/search_profiling.spec.ts @@ -4,8 +4,8 @@ * Uses the real SQLite database (spec/db/document.db loaded in-memory), * real sql module, real becca cache, and the full app stack. * - * Seeds a large number of notes via direct SQL (much faster than ETAPI) - * to create a realistic dataset for profiling. + * Profiles search at large scale (50K+ notes) to match real-world + * performance reports from users with 240K+ notes. 
*/ import { Application } from "express"; import { beforeAll, describe, expect, it } from "vitest"; @@ -58,224 +58,246 @@ describe("Search profiling (integration)", () => { app = await buildApp(); }); - it("seed and profile with realistic data", async () => { + it("large-scale profiling (50K notes)", async () => { const sql = (await import("../src/services/sql.js")).default; const becca = (await import("../src/becca/becca.js")).default; const beccaLoader = (await import("../src/becca/becca_loader.js")).default; const cls = (await import("../src/services/cls.js")).default; const searchService = (await import("../src/services/search/services/search.js")).default; const SearchContext = (await import("../src/services/search/search_context.js")).default; + const beccaService = (await import("../src/becca/becca_service.js")).default; await new Promise((resolve) => { cls.init(() => { const initialNoteCount = Object.keys(becca.notes).length; console.log(`\n Initial becca notes: ${initialNoteCount}`); - const configs = [ - { notes: 2000, words: 500, label: "2K notes × 500 words (~4KB)" }, - { notes: 2000, words: 2000, label: "2K notes × 2000 words (~15KB)" }, - { notes: 5000, words: 500, label: "5K notes × 500 words (~4KB)" }, - { notes: 5000, words: 2000, label: "5K notes × 2000 words (~15KB)" }, - { notes: 10000, words: 1000, label: "10K notes × 1000 words (~8KB)" }, - ]; + // ── Seed 50K notes with hierarchy ── + // Some folders (depth), some with common keyword "test" in title + const TOTAL_NOTES = 50000; + const FOLDER_COUNT = 500; // 500 folders + const NOTES_PER_FOLDER = (TOTAL_NOTES - FOLDER_COUNT) / FOLDER_COUNT; // ~99 notes per folder + const MATCH_FRACTION = 0.10; // 10% match "test" — ~5000 notes + const CONTENT_WORDS = 500; - for (const cfg of configs) { - // Reset DB: delete all seeded notes from prior iteration - sql.execute(`DELETE FROM blobs WHERE blobId LIKE 'seed%'`); - sql.execute(`DELETE FROM notes WHERE noteId LIKE 'seed%'`); - sql.execute(`DELETE 
FROM branches WHERE branchId LIKE 'seed%'`); + const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); + console.log(` Seeding ${TOTAL_NOTES} notes (${FOLDER_COUNT} folders, ~${NOTES_PER_FOLDER.toFixed(0)} per folder)...`); - const TOTAL_NOTES = cfg.notes; - const MATCH_FRACTION = 0.15; - const CONTENT_WORDS = cfg.words; - const matchCount = Math.floor(TOTAL_NOTES * MATCH_FRACTION); + const [, seedMs] = timed(() => { + sql.transactional(() => { + const folderIds: string[] = []; - const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); + // Create folders under root + for (let f = 0; f < FOLDER_COUNT; f++) { + const noteId = `seed${randomId(8)}`; + const branchId = `seed${randomId(8)}`; + const blobId = `seed${randomId(16)}`; + folderIds.push(noteId); - console.log(`\n ──── ${cfg.label} ────`); - console.log(` Seeding ${TOTAL_NOTES} notes (${matchCount} with keyword)...`); + sql.execute( + `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, + [blobId, `

Folder ${f}

`, now, now] + ); + sql.execute( + `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, + dateCreated, dateModified, utcDateCreated, utcDateModified) + VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, + [noteId, `Folder ${f} ${randomWord(5)}`, blobId, now, now, now, now] + ); + sql.execute( + `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) + VALUES (?, ?, 'root', ?, 0, 0, ?)`, + [branchId, noteId, f * 10, now] + ); + } - const [, seedMs] = timed(() => { - sql.transactional(() => { - for (let i = 0; i < TOTAL_NOTES; i++) { - const isMatch = i < matchCount; + // Create notes under folders + let noteIdx = 0; + for (let f = 0; f < FOLDER_COUNT; f++) { + const parentId = folderIds[f]; + for (let n = 0; n < NOTES_PER_FOLDER; n++) { + const isMatch = noteIdx < TOTAL_NOTES * MATCH_FRACTION; const noteId = `seed${randomId(8)}`; const branchId = `seed${randomId(8)}`; const blobId = `seed${randomId(16)}`; const title = isMatch - ? `Performance Doc ${i} ${randomWord(6)}` - : `General Note ${i} ${randomWord(6)} ${randomWord(5)}`; - const content = generateContent( - CONTENT_WORDS, - isMatch ? "performance" : undefined - ); + ? `Test Document ${noteIdx} ${randomWord(6)}` + : `Note ${noteIdx} ${randomWord(6)} ${randomWord(5)}`; + const content = generateContent(CONTENT_WORDS, isMatch ? 
"test" : undefined); sql.execute( - `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) - VALUES (?, ?, ?, ?)`, + `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, [blobId, content, now, now] ); - sql.execute( `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, dateCreated, dateModified, utcDateCreated, utcDateModified) VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, [noteId, title, blobId, now, now, now, now] ); - sql.execute( - `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, - utcDateModified) - VALUES (?, ?, 'root', ?, 0, 0, ?)`, - [branchId, noteId, i * 10, now] + `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) + VALUES (?, ?, ?, ?, 0, 0, ?)`, + [branchId, noteId, parentId, n * 10, now] ); + noteIdx++; } - }); - }); - console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); - - // Reload becca to pick up new notes - const [, reloadMs] = timed(() => { - beccaLoader.load(); - }); - console.log(` Becca reload: ${reloadMs.toFixed(0)}ms`); - console.log(` Becca notes after seed: ${Object.keys(becca.notes).length}`); - - // Verify content is accessible - const sampleNote = Object.values(becca.notes).find(n => n.title.startsWith("Performance Doc")); - if (sampleNote) { - const content = sampleNote.getContent(); - console.log(` Sample content length: ${typeof content === 'string' ? content.length : 0} chars`); - } - - // ========================================== - // PROFILING - // ========================================== - - console.log(`\n --- PROFILING (${cfg.label}) ---\n`); - - // --- 1. 
Fast search (NoteFlatTextExp only) --- - searchService.findResultsWithQuery("performance", new SearchContext({ fastSearch: true })); - - const fastTimes: number[] = []; - let fastResultCount = 0; - for (let i = 0; i < 5; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: true }) - ) - ); - fastTimes.push(ms); - fastResultCount = r.length; - } - const fastAvg = fastTimes.reduce((a, b) => a + b, 0) / fastTimes.length; - console.log(` Fast search (flat text only): avg ${fastAvg.toFixed(1)}ms (${fastResultCount} results)`); - - // --- 2. Full search (flat text + content fulltext via SQL) --- - const fullTimes: number[] = []; - let fullResultCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: false }) - ) - ); - fullTimes.push(ms); - fullResultCount = r.length; - } - const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; - console.log(` Full search (flat + SQL content): avg ${fullAvg.toFixed(1)}ms (${fullResultCount} results)`); - - // --- 3. Content snippet extraction --- - const fastResults = searchService.findResultsWithQuery("performance", - new SearchContext({ fastSearch: true })); - const trimmed = fastResults.slice(0, 200); - const tokens = ["performance"]; - - const snippetTimes: number[] = []; - for (let i = 0; i < 3; i++) { - const [, ms] = timed(() => { - for (const r of trimmed) { - r.contentSnippet = searchService.extractContentSnippet(r.noteId, tokens); - } - }); - snippetTimes.push(ms); - } - const snippetAvg = snippetTimes.reduce((a, b) => a + b, 0) / snippetTimes.length; - console.log(` Content snippet (${trimmed.length} results): avg ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); - - // --- 4. 
Raw getContent() cost --- - const contentTimes: number[] = []; - const textNotes = trimmed - .map(r => becca.notes[r.noteId]) - .filter(n => n && ["text", "code"].includes(n.type)); - - for (let i = 0; i < 5; i++) { - const [, ms] = timed(() => { - for (const n of textNotes) n.getContent(); - }); - contentTimes.push(ms); - } - const contentAvg = contentTimes.reduce((a, b) => a + b, 0) / contentTimes.length; - console.log(` getContent() × ${textNotes.length} notes: avg ${contentAvg.toFixed(1)}ms (${(contentAvg / textNotes.length).toFixed(3)}ms/note)`); - - // --- 5. striptags + normalize cost (isolated) --- - const striptags = require("striptags"); - const normalizeString = require("normalize-strings"); - const contents = textNotes.map(n => n.getContent() as string).filter(Boolean); - - const [, stripMs] = timed(() => { - for (const c of contents) { - striptags(c); } }); - console.log(` striptags × ${contents.length} notes: ${stripMs.toFixed(1)}ms (${(stripMs / contents.length).toFixed(3)}ms/note)`); + }); + console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); - const stripped = contents.map(c => striptags(c)); - const [, normMs] = timed(() => { - for (const s of stripped) { - normalizeString(s.toLowerCase()); - } - }); - console.log(` normalizeString × ${stripped.length} notes: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); + const [, reloadMs] = timed(() => beccaLoader.load()); + const totalNotes = Object.keys(becca.notes).length; + console.log(` Becca reload: ${reloadMs.toFixed(0)}ms Total notes: ${totalNotes}`); - // --- 6. 
Full autocomplete --- - const autoTimes: number[] = []; - let autoResultCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.searchNotesForAutocomplete("performance", true) - ); - autoTimes.push(ms); - autoResultCount = r.length; + // ── Warm caches ── + searchService.searchNotesForAutocomplete("test", true); + + // ════════════════════════════════════════════ + // PROFILING AT SCALE + // ════════════════════════════════════════════ + + console.log(`\n ════ PROFILING (${totalNotes} notes) ════\n`); + + // 1. getCandidateNotes cost (the full-scan bottleneck) + const allNotes = Object.values(becca.notes); + const [, flatScanMs] = timed(() => { + let count = 0; + for (const note of allNotes) { + const ft = note.getFlatText(); + if (ft.includes("test")) count++; } - const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; - console.log(`\n FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms (${autoResultCount} results)`); + return count; + }); + console.log(` getFlatText + includes scan (${allNotes.length} notes): ${flatScanMs.toFixed(1)}ms`); - // --- 7. 
SQL content scan cost --- - const [scanCount, scanMs] = timed(() => { - let count = 0; - for (const row of sql.iterateRows<{ content: Buffer | string }>(` - SELECT noteId, type, mime, content, isProtected - FROM notes JOIN blobs USING (blobId) - WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') - AND isDeleted = 0 - AND LENGTH(content) < 2097152`)) { - count++; - } - return count; - }); - console.log(` SQL content scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); - - // --- Summary --- - console.log(`\n === SUMMARY (${cfg.label}, ${Object.keys(becca.notes).length} total notes) ===`); - console.log(` Fast search: ${fastAvg.toFixed(1)}ms`); - console.log(` Full search: ${fullAvg.toFixed(1)}ms`); - console.log(` Content snippets: ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`); - console.log(` normalizeString: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`); - console.log(` Full autocomplete: ${autoAvg.toFixed(1)}ms`); - console.log(` SQL scan: ${scanMs.toFixed(1)}ms (${scanCount} rows)`); + // 2. Full findResultsWithQuery (includes candidate scan + parent walk + scoring) + const findTimes: number[] = []; + let findResultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })) + ); + findTimes.push(ms); + findResultCount = r.length; } + const findAvg = findTimes.reduce((a, b) => a + b, 0) / findTimes.length; + console.log(` findResultsWithQuery (fast): avg ${findAvg.toFixed(1)}ms (${findResultCount} results)`); + + // 3. 
Exact-only (no fuzzy) + const exactTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true, enableFuzzyMatching: false })) + ); + exactTimes.push(ms); + } + const exactAvg = exactTimes.reduce((a, b) => a + b, 0) / exactTimes.length; + console.log(` findResultsWithQuery (exact): avg ${exactAvg.toFixed(1)}ms`); + console.log(` Fuzzy overhead: ${(findAvg - exactAvg).toFixed(1)}ms`); + + // 4. SearchResult construction + computeScore cost (isolated) + const results = searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })); + console.log(` Total results before trim: ${results.length}`); + + const [, scoreAllMs] = timed(() => { + for (const r of results) r.computeScore("test", ["test"], true); + }); + console.log(` computeScore × ${results.length}: ${scoreAllMs.toFixed(1)}ms (${(scoreAllMs / results.length).toFixed(3)}ms/result)`); + + // 5. getNoteTitleForPath for all results + const [, pathTitleMs] = timed(() => { + for (const r of results) beccaService.getNoteTitleForPath(r.notePathArray); + }); + console.log(` getNoteTitleForPath × ${results.length}: ${pathTitleMs.toFixed(1)}ms`); + + // 6. Content snippet extraction (only 200) + const trimmed = results.slice(0, 200); + const [, snippetMs] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet(r.noteId, ["test"]); + } + }); + console.log(` extractContentSnippet × 200: ${snippetMs.toFixed(1)}ms`); + + // 7. Highlighting (only 200) + const [, hlMs] = timed(() => { + searchService.highlightSearchResults(trimmed, ["test"]); + }); + console.log(` highlightSearchResults × 200: ${hlMs.toFixed(1)}ms`); + + // 7b. 
getBestNotePath cost (used by fast path) + const sampleNotes = Object.values(becca.notes).filter(n => n.title.startsWith("Test Document")).slice(0, 1000); + const [, bestPathMs] = timed(() => { + for (const n of sampleNotes) n.getBestNotePath(); + }); + console.log(` getBestNotePath × ${sampleNotes.length}: ${bestPathMs.toFixed(1)}ms (${(bestPathMs/sampleNotes.length).toFixed(3)}ms/note)`); + + // 8. Full autocomplete end-to-end + const autoTimes: number[] = []; + let autoCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.searchNotesForAutocomplete("test", true) + ); + autoTimes.push(ms); + autoCount = r.length; + } + const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; + const autoMin = Math.min(...autoTimes); + console.log(`\n ★ FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms min ${autoMin.toFixed(1)}ms (${autoCount} results)`); + + // 9. With a less common search term (fewer matches) + const rareTimes: number[] = []; + let rareCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => + searchService.searchNotesForAutocomplete("leitfaden", true) + ); + rareTimes.push(ms); + rareCount = r.length; + } + const rareAvg = rareTimes.reduce((a, b) => a + b, 0) / rareTimes.length; + console.log(` Autocomplete "leitfaden": avg ${rareAvg.toFixed(1)}ms (${rareCount} results)`); + + // 10. Full search (fastSearch=false) — the 2.7s bottleneck + console.log(`\n ── Full search (fastSearch=false) ──`); + const fullTimes: number[] = []; + let fullCount = 0; + for (let i = 0; i < 2; i++) { + const [r, ms] = timed(() => + searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: false })) + ); + fullTimes.push(ms); + fullCount = r.length; + } + const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; + console.log(` Full search (flat + SQL): avg ${fullAvg.toFixed(1)}ms (${fullCount} results)`); + + // 11. 
SQL content scan alone + const [scanCount, scanMs] = timed(() => { + let count = 0; + for (const row of sql.iterateRows<{ content: Buffer | string }>(` + SELECT noteId, type, mime, content, isProtected + FROM notes JOIN blobs USING (blobId) + WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') + AND isDeleted = 0 + AND LENGTH(content) < 2097152`)) { + count++; + } + return count; + }); + console.log(` Raw SQL scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); + + // ── Summary ── + console.log(`\n ════ SUMMARY ════`); + console.log(` Notes: ${totalNotes} | Matches: ${findResultCount} | Hierarchy depth: 3 (root → folder → note)`); + console.log(` ──────────────────────────────────`); + console.log(` Autocomplete (fast): ${autoAvg.toFixed(1)}ms`); + console.log(` findResults: ${findAvg.toFixed(1)}ms (${((findAvg/autoAvg)*100).toFixed(0)}%)`); + console.log(` snippets+highlight: ${(snippetMs + hlMs).toFixed(1)}ms (${(((snippetMs+hlMs)/autoAvg)*100).toFixed(0)}%)`); + console.log(` Full search: ${fullAvg.toFixed(1)}ms`); resolve(); }); diff --git a/apps/server/src/becca/becca-interface.ts b/apps/server/src/becca/becca-interface.ts index 1a8203f436..6619ed30b9 100644 --- a/apps/server/src/becca/becca-interface.ts +++ b/apps/server/src/becca/becca-interface.ts @@ -31,9 +31,17 @@ export default class Becca { allNoteSetCache: NoteSet | null; + /** + * Pre-built parallel arrays for fast flat text scanning in search. + * Avoids per-note property access overhead when iterating 50K+ notes. + * Dirtied when notes change (along with allNoteSetCache). 
+ */ + flatTextIndex: { notes: BNote[], flatTexts: string[] } | null; + constructor() { this.reset(); this.allNoteSetCache = null; + this.flatTextIndex = null; } reset() { @@ -239,6 +247,28 @@ export default class Becca { /** Should be called when the set of all non-skeleton notes changes (added/removed) */ dirtyNoteSetCache() { this.allNoteSetCache = null; + this.flatTextIndex = null; + } + + /** + * Returns pre-built parallel arrays of notes and their flat texts for fast scanning. + * The flat texts are already normalized (lowercase, diacritics removed). + */ + getFlatTextIndex(): { notes: BNote[], flatTexts: string[] } { + if (!this.flatTextIndex) { + const allNoteSet = this.getAllNoteSet(); + const notes: BNote[] = []; + const flatTexts: string[] = []; + + for (const note of allNoteSet.notes) { + notes.push(note); + flatTexts.push(note.getFlatText()); + } + + this.flatTextIndex = { notes, flatTexts }; + } + + return this.flatTextIndex; } getAllNoteSet() { diff --git a/apps/server/src/becca/entities/bnote.ts b/apps/server/src/becca/entities/bnote.ts index 112543a603..4e78974b4e 100644 --- a/apps/server/src/becca/entities/bnote.ts +++ b/apps/server/src/becca/entities/bnote.ts @@ -790,6 +790,9 @@ class BNote extends AbstractBeccaEntity { this.__attributeCache = null; this.__inheritableAttributeCache = null; this.__ancestorCache = null; + + // Dirty the becca-level flat text index since this note's flat text may have changed + this.becca.flatTextIndex = null; } invalidateSubTree(path: string[] = []) { diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index b9ad19c36c..93213d164e 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -99,6 +99,22 @@ class NoteFlatTextExp extends Expression { const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext); + // Fast path for 
single-token searches with a limit (e.g. autocomplete): + // Skip the expensive recursive parent walk and just use getBestNotePath(). + // The flat text already matched, so we know the token is present. + if (this.tokens.length === 1 && searchContext.limit) { + for (const note of candidateNotes) { + if (!resultNoteSet.hasNoteId(note.noteId)) { + const notePath = note.getBestNotePath(); + if (notePath) { + executionContext.noteIdToNotePath[note.noteId] = notePath; + resultNoteSet.add(note); + } + } + } + return resultNoteSet; + } + for (const note of candidateNotes) { // autocomplete should be able to find notes by their noteIds as well (only leafs) if (this.tokens.length === 1 && note.noteId.toLowerCase() === this.tokens[0]) { @@ -112,7 +128,7 @@ class NoteFlatTextExp extends Expression { // Add defensive checks for undefined properties const typeMatches = note.type && note.type.includes(token); const mimeMatches = note.mime && note.mime.includes(token); - + if (typeMatches || mimeMatches) { foundAttrTokens.push(token); } @@ -165,14 +181,38 @@ class NoteFlatTextExp extends Expression { getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] { const candidateNotes: BNote[] = []; - for (const note of noteSet.notes) { - const normalizedFlatText = normalizeSearchText(note.getFlatText()); + // For limited searches (e.g. autocomplete), cap candidates to avoid + // processing thousands of matches when only a few hundred are needed. + // Use 5x the limit to ensure enough quality candidates for scoring. + const maxCandidates = searchContext?.limit ? searchContext.limit * 5 : Infinity; + + // Use the pre-built flat text index for fast scanning. + // This provides pre-computed flat texts in a parallel array, avoiding + // per-note property access overhead at large scale (50K+ notes). 
+ const { notes: indexNotes, flatTexts } = becca.getFlatTextIndex(); + + // Build a set for quick membership check when noteSet isn't the full set + const isFullSet = noteSet.notes.length === indexNotes.length; + + for (let i = 0; i < indexNotes.length; i++) { + const note = indexNotes[i]; + + // Skip notes not in the input set (only check when not using the full set) + if (!isFullSet && !noteSet.hasNoteId(note.noteId)) { + continue; + } + + const flatText = flatTexts[i]; for (const token of this.tokens) { - if (this.smartMatch(normalizedFlatText, token, searchContext)) { + if (this.smartMatch(flatText, token, searchContext)) { candidateNotes.push(note); break; } } + + if (candidateNotes.length >= maxCandidates) { + break; + } } return candidateNotes; diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 4701964f5b..7ee3e494f4 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -16,7 +16,6 @@ import type { SearchParams, TokenStructure } from "./types.js"; import type Expression from "../expressions/expression.js"; import sql from "../../sql.js"; import scriptService from "../../script.js"; -import striptags from "striptags"; import protectedSessionService from "../../protected_session.js"; export interface SearchNoteResult { @@ -249,23 +248,30 @@ function findResultsWithExpression(expression: Expression, searchContext: Search return performSearch(expression, searchContext, false); } + // For limited searches (e.g. autocomplete), skip the expensive two-phase + // fuzzy fallback. The user is typing and will refine their query — exact + // matching is sufficient and avoids a second full scan of all notes. 
+ if (searchContext.limit) { + return performSearch(expression, searchContext, false); + } + // Phase 1: Try exact matches first (without fuzzy matching) const exactResults = performSearch(expression, searchContext, false); - + // Check if we have sufficient high-quality results const minResultThreshold = 5; const minScoreForQuality = 10; // Minimum score to consider a result "high quality" - + const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality); - + // If we have enough high-quality exact matches, return them if (highQualityResults.length >= minResultThreshold) { return exactResults; } - + // Phase 2: Add fuzzy matching as fallback when exact matches are insufficient const fuzzyResults = performSearch(expression, searchContext, true); - + // Merge results, ensuring exact matches always rank higher than fuzzy matches return mergeExactAndFuzzyResults(exactResults, fuzzyResults); } @@ -447,7 +453,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength try { let content = note.getContent(); - + if (!content || typeof content !== "string") { return ""; } @@ -463,77 +469,66 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength return ""; // Protected but no session available } - // Strip HTML tags for text notes + // Strip HTML tags for text notes — use fast regex for snippet extraction + // (striptags library is ~18x slower and not needed for search snippets) if (note.type === "text") { - content = striptags(content); + content = content.replace(/<[^>]*>/g, ""); } - // Normalize whitespace while preserving paragraph breaks - // First, normalize multiple newlines to double newlines (paragraph breaks) - content = content.replace(/\n\s*\n/g, "\n\n"); - // Then normalize spaces within lines - content = content.split('\n').map(line => line.replace(/\s+/g, " ").trim()).join('\n'); - // Finally trim the whole content - content = content.trim(); - if (!content) { return ""; } - 
// Try to find a snippet around the first matching token + // Find match position using normalize on the raw stripped content. + // We use a single normalize() pass — no need for expensive whitespace + // normalization just to find the match index. const normalizedContent = normalize(content); + const normalizedTokens = searchTokens.map(token => normalize(token)); let snippetStart = 0; - let matchFound = false; - for (const token of searchTokens) { - const normalizedToken = normalize(token); + for (const normalizedToken of normalizedTokens) { const matchIndex = normalizedContent.indexOf(normalizedToken); - + if (matchIndex !== -1) { // Center the snippet around the match snippetStart = Math.max(0, matchIndex - maxLength / 2); - matchFound = true; break; } } - // Extract snippet - let snippet = content.substring(snippetStart, snippetStart + maxLength); + // Extract a snippet region from the raw content, then clean only that + const snippetRegion = content.substring(snippetStart, snippetStart + maxLength + 100); - // If snippet contains linebreaks, limit to max 4 lines and override character limit + // Normalize whitespace only on the small snippet region + let snippet = snippetRegion + .replace(/\n\s*\n/g, "\n\n") + .replace(/[ \t]+/g, " ") + .trim() + .substring(0, maxLength); + + // If snippet contains linebreaks, limit to max 4 lines const lines = snippet.split('\n'); if (lines.length > 4) { - // Find which lines contain the search tokens to ensure they're included - const normalizedLines = lines.map(line => normalize(line)); - const normalizedTokens = searchTokens.map(token => normalize(token)); - // Find the first line that contains a search token let firstMatchLine = -1; - for (let i = 0; i < normalizedLines.length; i++) { - if (normalizedTokens.some(token => normalizedLines[i].includes(token))) { + for (let i = 0; i < lines.length; i++) { + const normalizedLine = normalize(lines[i]); + if (normalizedTokens.some(token => normalizedLine.includes(token))) { 
firstMatchLine = i; break; } } if (firstMatchLine !== -1) { - // Center the 4-line window around the first match - // Try to show 1 line before and 2 lines after the match const startLine = Math.max(0, firstMatchLine - 1); const endLine = Math.min(lines.length, startLine + 4); snippet = lines.slice(startLine, endLine).join('\n'); } else { - // No match found in lines (shouldn't happen), just take first 4 snippet = lines.slice(0, 4).join('\n'); } - // Add ellipsis if we truncated lines snippet = snippet + "..."; - } else if (lines.length > 1) { - // For multi-line snippets that are 4 or fewer lines, keep them as-is - // No need to truncate - } else { - // Single line content - apply original word boundary logic - // Try to start/end at word boundaries + } else if (lines.length <= 1) { + // Single line content - apply word boundary logic if (snippetStart > 0) { const firstSpace = snippet.search(/\s/); if (firstSpace > 0 && firstSpace < 20) { @@ -541,7 +536,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength } snippet = "..." + snippet; } - + if (snippetStart + maxLength < content.length) { const lastSpace = snippet.search(/\s[^\s]*$/); if (lastSpace > snippet.length - 20 && lastSpace > 0) { @@ -649,7 +644,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) { includeHiddenNotes: true, fuzzyAttributeSearch: true, ignoreInternalAttributes: true, - ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId() + ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? 
"root" : hoistedNoteService.getHoistedNoteId(), + limit: 200 }); const allSearchResults = findResultsWithQuery(query, searchContext); From 6a06fc79956914f1b437dd5131886ccaa14fb60c Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Thu, 12 Mar 2026 14:02:23 -0700 Subject: [PATCH 03/33] feat(search): get rid of candidate capping --- .../src/services/search/expressions/note_flat_text.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index 93213d164e..eff3622a76 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -181,11 +181,6 @@ class NoteFlatTextExp extends Expression { getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] { const candidateNotes: BNote[] = []; - // For limited searches (e.g. autocomplete), cap candidates to avoid - // processing thousands of matches when only a few hundred are needed. - // Use 5x the limit to ensure enough quality candidates for scoring. - const maxCandidates = searchContext?.limit ? searchContext.limit * 5 : Infinity; - // Use the pre-built flat text index for fast scanning. // This provides pre-computed flat texts in a parallel array, avoiding // per-note property access overhead at large scale (50K+ notes). 
@@ -210,10 +205,7 @@ class NoteFlatTextExp extends Expression { } } - if (candidateNotes.length >= maxCandidates) { - break; - } - } +} return candidateNotes; } From 9403efa9a1a99be6afe365eaf4fff1859555a760 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Thu, 12 Mar 2026 14:21:36 -0700 Subject: [PATCH 04/33] feat(search): add even some more robust tests --- .../search/services/search_profiling.spec.ts | 151 +++++++++++++++++- 1 file changed, 145 insertions(+), 6 deletions(-) diff --git a/apps/server/src/services/search/services/search_profiling.spec.ts b/apps/server/src/services/search/services/search_profiling.spec.ts index 96a414b257..6ed5d9fbb7 100644 --- a/apps/server/src/services/search/services/search_profiling.spec.ts +++ b/apps/server/src/services/search/services/search_profiling.spec.ts @@ -33,9 +33,10 @@ function randomWord(len = 6): string { return word; } -function generateHtmlContent(wordCount: number, includeTarget = false): string { +function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string { const paragraphs: string[] = []; let wordsRemaining = wordCount; + const kws = keywords ?? ["target"]; while (wordsRemaining > 0) { const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40)); @@ -43,8 +44,12 @@ function generateHtmlContent(wordCount: number, includeTarget = false): string { for (let i = 0; i < paraWords; i++) { words.push(randomWord(3 + Math.floor(Math.random() * 10))); } - if (includeTarget && paragraphs.length === 2) { - words[Math.floor(words.length / 2)] = "target"; + if (includeKeywords && paragraphs.length === 2) { + // Inject all keywords into the paragraph at spaced positions + for (let k = 0; k < kws.length; k++) { + const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1))); + words[pos] = kws[k]; + } } paragraphs.push(`

${words.join(" ")}

`); wordsRemaining -= paraWords; @@ -80,12 +85,21 @@ function buildDataset(noteCount: number, opts: { labelsPerNote?: number; depth?: number; contentWordCount?: number; + /** When set, contentWordCount is treated as a median and actual sizes vary from 0.2x to 3x */ + varyContentSize?: boolean; + /** Keywords to inject into matching notes' titles (default: ["target"]) */ + titleKeywords?: string[]; + /** Keywords to inject into matching notes' content (default: same as titleKeywords) */ + contentKeywords?: string[]; } = {}) { const { matchFraction = 0.1, labelsPerNote = 3, depth = 3, contentWordCount = 200, + varyContentSize = false, + titleKeywords = ["target"], + contentKeywords = titleKeywords, } = opts; becca.reset(); @@ -115,18 +129,39 @@ function buildDataset(noteCount: number, opts: { for (let i = 0; i < noteCount; i++) { const isMatch = i < matchCount; const title = isMatch - ? `${randomWord(5)} target ${randomWord(5)} Document ${i}` + ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}` : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`; const n = note(title); for (let l = 0; l < labelsPerNote; l++) { const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`; - const labelValue = isMatch && l === 0 ? "important target" : randomWord(8); + const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8); n.label(labelName, labelValue); } - syntheticContent[n.note.noteId] = generateHtmlContent(contentWordCount, isMatch); + // Vary content size: 0.2x to 3x the median, producing a realistic + // mix of short stubs, medium notes, and long documents. 
+ let noteWordCount = contentWordCount; + if (varyContentSize) { + const r = Math.random(); + if (r < 0.2) { + noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3)); // 20-50% (short stubs) + } else if (r < 0.7) { + noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6)); // 70-130% (medium) + } else if (r < 0.9) { + noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7)); // 130-200% (long) + } else { + noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0)); // 200-300% (very long) + } + } + + const includeContentKeyword = isMatch && contentKeywords.length > 0; + syntheticContent[n.note.noteId] = generateHtmlContent( + noteWordCount, + includeContentKeyword, + includeContentKeyword ? contentKeywords : undefined + ); const containerIndex = i % containers.length; containers[containerIndex].child(n); @@ -451,6 +486,110 @@ describe("Search Profiling", () => { /** * End-to-end scaling to give the full picture. */ + /** + * Multi-token search with varying content sizes. + * Real users search things like "meeting notes january" — this exercises + * the multi-token path (which doesn't use the single-token fast path) + * with a realistic mix of note sizes. 
+ */ + describe("Multi-token search with varying content sizes", () => { + + it("single vs multi-token autocomplete at scale", () => { + console.log("\n=== Single vs multi-token autocomplete (varying content sizes) ==="); + + for (const noteCount of [1000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.15, + contentWordCount: 400, + varyContentSize: true, + depth: 5, + titleKeywords: ["meeting", "notes", "january"], + contentKeywords: ["meeting", "notes", "january"], + }); + + // Warm up + searchService.searchNotesForAutocomplete("meeting", true); + + // Single token + const singleTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting", true)); + singleTimes.push(ms); + } + const singleAvg = singleTimes.reduce((a, b) => a + b, 0) / singleTimes.length; + + // Two tokens + const twoTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes", true)); + twoTimes.push(ms); + } + const twoAvg = twoTimes.reduce((a, b) => a + b, 0) / twoTimes.length; + + // Three tokens + const threeTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes january", true)); + threeTimes.push(ms); + } + const threeAvg = threeTimes.reduce((a, b) => a + b, 0) / threeTimes.length; + + console.log( + ` ${String(noteCount).padStart(6)} notes: ` + + `1-token ${singleAvg.toFixed(1)}ms ` + + `2-token ${twoAvg.toFixed(1)}ms ` + + `3-token ${threeAvg.toFixed(1)}ms` + ); + } + }); + + it("multi-token with realistic content size distribution", () => { + console.log("\n=== Multi-token search — content size distribution ==="); + + buildDataset(5000, { + matchFraction: 0.15, + contentWordCount: 400, + varyContentSize: true, + depth: 5, + titleKeywords: ["project", "review"], + contentKeywords: ["project", "review"], + }); + + // Report the actual 
content size distribution + const sizes = Object.values(syntheticContent).map(c => c.length); + sizes.sort((a, b) => a - b); + const p10 = sizes[Math.floor(sizes.length * 0.1)]; + const p50 = sizes[Math.floor(sizes.length * 0.5)]; + const p90 = sizes[Math.floor(sizes.length * 0.9)]; + const p99 = sizes[Math.floor(sizes.length * 0.99)]; + console.log(` Content sizes: p10=${p10} p50=${p50} p90=${p90} p99=${p99} chars`); + + // Warm up + searchService.searchNotesForAutocomplete("project", true); + + const queries = [ + "project", + "project review", + "project review document", + `${randomWord(7)}`, // no-match single token + `${randomWord(5)} ${randomWord(6)}`, // no-match multi token + ]; + + for (const query of queries) { + const times: number[] = []; + let resultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => searchService.searchNotesForAutocomplete(query, true)); + times.push(ms); + resultCount = r.length; + } + const avg = times.reduce((a, b) => a + b, 0) / times.length; + const label = `"${query}"`.padEnd(35); + console.log(` ${label} ${avg.toFixed(1)}ms (${resultCount} results)`); + } + }); + }); + describe("End-to-end scaling", () => { it("autocomplete at different scales", () => { From 1c148f407cc8c397374b87575c984c15f0fae1da Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Thu, 12 Mar 2026 14:35:17 -0700 Subject: [PATCH 05/33] feat(search): don't toss the entire index after each note change --- apps/server/src/becca/becca-interface.ts | 46 +++++++++++++++++-- apps/server/src/becca/entities/battribute.ts | 10 ++++ apps/server/src/becca/entities/bnote.ts | 4 +- .../search/expressions/note_flat_text.ts | 9 ++-- 4 files changed, 56 insertions(+), 13 deletions(-) diff --git a/apps/server/src/becca/becca-interface.ts b/apps/server/src/becca/becca-interface.ts index 6619ed30b9..1a95408d9b 100644 --- a/apps/server/src/becca/becca-interface.ts +++ b/apps/server/src/becca/becca-interface.ts @@ -34,14 +34,19 @@ export default class Becca { /** 
* Pre-built parallel arrays for fast flat text scanning in search. * Avoids per-note property access overhead when iterating 50K+ notes. - * Dirtied when notes change (along with allNoteSetCache). + * Supports incremental updates: when individual notes change, only their + * entries are rebuilt rather than the entire index. */ - flatTextIndex: { notes: BNote[], flatTexts: string[] } | null; + flatTextIndex: { notes: BNote[], flatTexts: string[], noteIdToIdx: Map } | null; + + /** NoteIds whose flat text needs to be recomputed in the index. */ + dirtyFlatTextNoteIds: Set; constructor() { - this.reset(); + this.dirtyFlatTextNoteIds = new Set(); this.allNoteSetCache = null; this.flatTextIndex = null; + this.reset(); } reset() { @@ -247,25 +252,56 @@ export default class Becca { /** Should be called when the set of all non-skeleton notes changes (added/removed) */ dirtyNoteSetCache() { this.allNoteSetCache = null; + // Full rebuild needed since the note set itself changed this.flatTextIndex = null; + this.dirtyFlatTextNoteIds.clear(); + } + + /** Mark a single note's flat text as needing recomputation in the index. */ + dirtyNoteFlatText(noteId: string) { + if (this.flatTextIndex) { + // Index exists — schedule an incremental update + this.dirtyFlatTextNoteIds.add(noteId); + } + // If flatTextIndex is null, full rebuild will happen on next access anyway } /** * Returns pre-built parallel arrays of notes and their flat texts for fast scanning. * The flat texts are already normalized (lowercase, diacritics removed). + * Supports incremental updates: when individual notes are dirtied, only their + * entries are recomputed rather than rebuilding the entire index. 
*/ - getFlatTextIndex(): { notes: BNote[], flatTexts: string[] } { + getFlatTextIndex(): { notes: BNote[], flatTexts: string[], noteIdToIdx: Map } { if (!this.flatTextIndex) { const allNoteSet = this.getAllNoteSet(); const notes: BNote[] = []; const flatTexts: string[] = []; + const noteIdToIdx = new Map(); for (const note of allNoteSet.notes) { + noteIdToIdx.set(note.noteId, notes.length); notes.push(note); flatTexts.push(note.getFlatText()); } - this.flatTextIndex = { notes, flatTexts }; + this.flatTextIndex = { notes, flatTexts, noteIdToIdx }; + this.dirtyFlatTextNoteIds.clear(); + } else if (this.dirtyFlatTextNoteIds.size > 0) { + // Incremental update: only recompute flat texts for dirtied notes + const { flatTexts, noteIdToIdx } = this.flatTextIndex; + + for (const noteId of this.dirtyFlatTextNoteIds) { + const idx = noteIdToIdx.get(noteId); + if (idx !== undefined) { + const note = this.notes[noteId]; + if (note) { + flatTexts[idx] = note.getFlatText(); + } + } + } + + this.dirtyFlatTextNoteIds.clear(); } return this.flatTextIndex; diff --git a/apps/server/src/becca/entities/battribute.ts b/apps/server/src/becca/entities/battribute.ts index 6ff1246fcf..77a15c2fd1 100644 --- a/apps/server/src/becca/entities/battribute.ts +++ b/apps/server/src/becca/entities/battribute.ts @@ -6,6 +6,7 @@ import dateUtils from "../../services/date_utils.js"; import promotedAttributeDefinitionParser from "../../services/promoted_attribute_definition_parser.js"; import sanitizeAttributeName from "../../services/sanitize_attribute_name.js"; import type { AttributeRow, AttributeType } from "@triliumnext/commons"; +import { normalize } from "../../services/utils.js"; interface SavingOpts { skipValidation?: boolean; @@ -34,6 +35,11 @@ class BAttribute extends AbstractBeccaEntity { value!: string; isInheritable!: boolean; + /** Pre-normalized (lowercase, diacritics removed) name for search. 
*/ + normalizedName!: string; + /** Pre-normalized (lowercase, diacritics removed) value for search. */ + normalizedValue!: string; + constructor(row?: AttributeRow) { super(); @@ -59,6 +65,10 @@ class BAttribute extends AbstractBeccaEntity { this.isInheritable = !!isInheritable; this.utcDateModified = utcDateModified; + // Pre-compute normalized forms for search (avoids repeated normalize() calls in hot loops) + this.normalizedName = normalize(this.name); + this.normalizedValue = normalize(this.value); + return this; } diff --git a/apps/server/src/becca/entities/bnote.ts b/apps/server/src/becca/entities/bnote.ts index 4e78974b4e..10750efc32 100644 --- a/apps/server/src/becca/entities/bnote.ts +++ b/apps/server/src/becca/entities/bnote.ts @@ -791,8 +791,8 @@ class BNote extends AbstractBeccaEntity { this.__inheritableAttributeCache = null; this.__ancestorCache = null; - // Dirty the becca-level flat text index since this note's flat text may have changed - this.becca.flatTextIndex = null; + // Mark only this note's flat text as dirty for incremental index update + this.becca.dirtyNoteFlatText(this.noteId); } invalidateSubTree(path: string[] = []) { diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index eff3622a76..ff54287d91 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -7,7 +7,7 @@ import Expression from "./expression.js"; import NoteSet from "../note_set.js"; import becca from "../../../becca/becca.js"; import { normalize } from "../../utils.js"; -import { normalizeSearchText, fuzzyMatchWord, fuzzyMatchWordWithResult } from "../utils/text_utils.js"; +import { normalizeSearchText, fuzzyMatchWordWithResult } from "../utils/text_utils.js"; import beccaService from "../../../becca/becca_service.js"; class NoteFlatTextExp extends Expression { @@ -67,11 +67,8 @@ class 
NoteFlatTextExp extends Expression { } for (const attribute of note.getOwnedAttributes()) { - const normalizedName = normalizeSearchText(attribute.name); - const normalizedValue = normalizeSearchText(attribute.value); - for (const token of remainingTokens) { - if (normalizedName.includes(token) || normalizedValue.includes(token)) { + if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) { foundAttrTokens.push(token); } } @@ -134,7 +131,7 @@ class NoteFlatTextExp extends Expression { } for (const attribute of note.ownedAttributes) { - if (normalizeSearchText(attribute.name).includes(token) || normalizeSearchText(attribute.value).includes(token)) { + if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) { foundAttrTokens.push(token); } } From b533546236a4264d64697ae9a76fb8a9f63c25a3 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Thu, 12 Mar 2026 14:35:47 -0700 Subject: [PATCH 06/33] fix(search): fix flying bracket --- apps/server/src/services/search/expressions/note_flat_text.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index ff54287d91..b1ceac991e 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -201,8 +201,7 @@ class NoteFlatTextExp extends Expression { break; } } - -} + } return candidateNotes; } From 5718631889ecb8fa01e72d2c2f374ebdc0de44a7 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Wed, 18 Mar 2026 09:46:24 -0700 Subject: [PATCH 07/33] fix(search): resolve issue with autocomplete with search performance enhancements --- .../src/services/search/expressions/note_flat_text.ts | 4 ++-- apps/server/src/services/search/search_context.ts | 3 +++ apps/server/src/services/search/services/search.ts | 10 +++++----- 
apps/server/src/services/search/services/types.ts | 2 ++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index b1ceac991e..ef3efbf66f 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -96,10 +96,10 @@ class NoteFlatTextExp extends Expression { const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext); - // Fast path for single-token searches with a limit (e.g. autocomplete): + // Fast path for single-token autocomplete searches: // Skip the expensive recursive parent walk and just use getBestNotePath(). // The flat text already matched, so we know the token is present. - if (this.tokens.length === 1 && searchContext.limit) { + if (this.tokens.length === 1 && searchContext.autocomplete) { for (const note of candidateNotes) { if (!resultNoteSet.hasNoteId(note.noteId)) { const notePath = note.getBestNotePath(); diff --git a/apps/server/src/services/search/search_context.ts b/apps/server/src/services/search/search_context.ts index 314c7e7ce6..55d4df5d2f 100644 --- a/apps/server/src/services/search/search_context.ts +++ b/apps/server/src/services/search/search_context.ts @@ -18,6 +18,8 @@ class SearchContext { debug?: boolean; debugInfo: {} | null; fuzzyAttributeSearch: boolean; + /** When true, skip the two-phase fuzzy fallback and use the single-token fast path. 
*/ + autocomplete: boolean; enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase highlightedTokens: string[]; originalQuery: string; @@ -46,6 +48,7 @@ class SearchContext { this.debug = params.debug; this.debugInfo = null; this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch; + this.autocomplete = !!params.autocomplete; this.enableFuzzyMatching = true; // Default to true for backward compatibility this.highlightedTokens = []; this.originalQuery = ""; diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 7ee3e494f4..b533c185fe 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -248,10 +248,10 @@ function findResultsWithExpression(expression: Expression, searchContext: Search return performSearch(expression, searchContext, false); } - // For limited searches (e.g. autocomplete), skip the expensive two-phase - // fuzzy fallback. The user is typing and will refine their query — exact - // matching is sufficient and avoids a second full scan of all notes. - if (searchContext.limit) { + // For autocomplete searches, skip the expensive two-phase fuzzy fallback. + // The user is typing and will refine their query — exact matching is + // sufficient and avoids a second full scan of all notes. + if (searchContext.autocomplete) { return performSearch(expression, searchContext, false); } @@ -645,7 +645,7 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) { fuzzyAttributeSearch: true, ignoreInternalAttributes: true, ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? 
"root" : hoistedNoteService.getHoistedNoteId(), - limit: 200 + autocomplete: true }); const allSearchResults = findResultsWithQuery(query, searchContext); diff --git a/apps/server/src/services/search/services/types.ts b/apps/server/src/services/search/services/types.ts index 7edc3b4ae5..60d00540c6 100644 --- a/apps/server/src/services/search/services/types.ts +++ b/apps/server/src/services/search/services/types.ts @@ -21,4 +21,6 @@ export interface SearchParams { limit?: number | null; debug?: boolean; fuzzyAttributeSearch?: boolean; + /** When true, skip the two-phase fuzzy fallback and use the single-token fast path. */ + autocomplete?: boolean; } From f23a7b48429223ed21f0b944920f28b5edb6b675 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Wed, 18 Mar 2026 11:43:28 -0700 Subject: [PATCH 08/33] feat(settings): also allow for fuzzy searching to just be disabled --- apps/client/src/translations/en/translation.json | 4 ++++ .../src/widgets/type_widgets/options/other.tsx | 16 ++++++++++++++++ apps/server/src/routes/api/options.ts | 1 + apps/server/src/services/options_init.ts | 3 +++ .../server/src/services/search/search_context.ts | 7 ++++++- packages/commons/src/lib/options_interface.ts | 4 ++++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index fdd6f9fb2d..f9ba8f8743 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1292,6 +1292,10 @@ "erase_excess_revision_snapshots": "Erase excess revision snapshots now", "erase_excess_revision_snapshots_prompt": "Excess revision snapshots have been erased." 
}, + "search": { + "title": "Search", + "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)" + }, "search_engine": { "title": "Search Engine", "custom_search_engine_info": "Custom search engine requires both a name and a URL to be set. If either of these is not set, DuckDuckGo will be used as the default search engine.", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx b/apps/client/src/widgets/type_widgets/options/other.tsx index e6813f8d2b..8cb99bace4 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -21,6 +21,7 @@ import TimeSelector from "./components/TimeSelector"; export default function OtherSettings() { return ( <> + {isElectron() && <> @@ -36,6 +37,21 @@ export default function OtherSettings() { ); } +function SearchSettings() { + const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching"); + + return ( + + + + ); +} + function SearchEngineSettings() { const [ customSearchEngineName, setCustomSearchEngineName ] = useTriliumOption("customSearchEngineName"); const [ customSearchEngineUrl, setCustomSearchEngineUrl ] = useTriliumOption("customSearchEngineUrl"); diff --git a/apps/server/src/routes/api/options.ts b/apps/server/src/routes/api/options.ts index bb6ffb00d6..049a898fca 100644 --- a/apps/server/src/routes/api/options.ts +++ b/apps/server/src/routes/api/options.ts @@ -97,6 +97,7 @@ const ALLOWED_OPTIONS = new Set([ "layoutOrientation", "backgroundEffects", "allowedHtmlTags", + "searchEnableFuzzyMatching", "redirectBareDomain", "showLoginInShareTheme", "splitEditorOrientation", diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index a49672019d..17ea5a1f0b 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -198,6 +198,9 @@ const defaultOptions: DefaultOption[] 
= [ isSynced: true }, + // Search settings + { name: "searchEnableFuzzyMatching", value: "true", isSynced: true }, + // Share settings { name: "redirectBareDomain", value: "false", isSynced: true }, { name: "showLoginInShareTheme", value: "false", isSynced: true }, diff --git a/apps/server/src/services/search/search_context.ts b/apps/server/src/services/search/search_context.ts index 55d4df5d2f..79b0b6db3d 100644 --- a/apps/server/src/services/search/search_context.ts +++ b/apps/server/src/services/search/search_context.ts @@ -1,6 +1,7 @@ "use strict"; import hoistedNoteService from "../hoisted_note.js"; +import optionService from "../options.js"; import type { SearchParams } from "./services/types.js"; class SearchContext { @@ -49,7 +50,11 @@ class SearchContext { this.debugInfo = null; this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch; this.autocomplete = !!params.autocomplete; - this.enableFuzzyMatching = true; // Default to true for backward compatibility + try { + this.enableFuzzyMatching = optionService.getOptionBool("searchEnableFuzzyMatching"); + } catch { + this.enableFuzzyMatching = true; // Default to true if option not yet initialized + } this.highlightedTokens = []; this.originalQuery = ""; this.fulltextQuery = ""; // complete fulltext part diff --git a/packages/commons/src/lib/options_interface.ts b/packages/commons/src/lib/options_interface.ts index 5582df79d2..6e36ebd7a3 100644 --- a/packages/commons/src/lib/options_interface.ts +++ b/packages/commons/src/lib/options_interface.ts @@ -134,6 +134,10 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions Date: Fri, 20 Mar 2026 11:26:19 -0700 Subject: [PATCH 09/33] feat(tests): implement search benchmark test... 
--- .../search/services/search_benchmark.spec.ts | 675 ++++++++++++++++++ 1 file changed, 675 insertions(+) create mode 100644 apps/server/src/services/search/services/search_benchmark.spec.ts diff --git a/apps/server/src/services/search/services/search_benchmark.spec.ts b/apps/server/src/services/search/services/search_benchmark.spec.ts new file mode 100644 index 0000000000..53319ff9cd --- /dev/null +++ b/apps/server/src/services/search/services/search_benchmark.spec.ts @@ -0,0 +1,675 @@ +/** + * Comprehensive search benchmark suite. + * + * Covers many scenarios: + * - Single-token, multi-token, phrase-like queries + * - Fuzzy matching enabled vs disabled + * - Autocomplete vs full search + * - Diacritics / unicode queries + * - No-match queries + * - Varying note counts (1K, 5K, 10K, 20K) + * - Warm cache vs cold cache + * + * All times are in-memory (monkeypatched getContent, no real SQL). + */ +import { describe, it, expect, afterEach } from "vitest"; +import searchService from "./search.js"; +import BNote from "../../../becca/entities/bnote.js"; +import BBranch from "../../../becca/entities/bbranch.js"; +import SearchContext from "../search_context.js"; +import becca from "../../../becca/becca.js"; +import { NoteBuilder, note } from "../../../test/becca_mocking.js"; + +// ── helpers ────────────────────────────────────────────────────────── + +function randomWord(len = 6): string { + const chars = "abcdefghijklmnopqrstuvwxyz"; + let word = ""; + for (let i = 0; i < len; i++) { + word += chars[Math.floor(Math.random() * chars.length)]; + } + return word; +} + +function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string { + const paragraphs: string[] = []; + let wordsRemaining = wordCount; + const kws = keywords ?? 
[]; + + while (wordsRemaining > 0) { + const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40)); + const words: string[] = []; + for (let i = 0; i < paraWords; i++) { + words.push(randomWord(3 + Math.floor(Math.random() * 10))); + } + if (includeKeywords && paragraphs.length === 2) { + for (let k = 0; k < kws.length; k++) { + const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1))); + words[pos] = kws[k]; + } + } + paragraphs.push(`

${words.join(" ")}

`); + wordsRemaining -= paraWords; + } + + return `${paragraphs.join("\n")}`; +} + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +function avg(nums: number[]): number { + return nums.reduce((a, b) => a + b, 0) / nums.length; +} + +function min(nums: number[]): number { + return Math.min(...nums); +} + +// ── dataset builder ────────────────────────────────────────────────── + +const syntheticContent: Record = {}; + +function buildDataset(noteCount: number, opts: { + matchFraction?: number; + labelsPerNote?: number; + depth?: number; + contentWordCount?: number; + varyContentSize?: boolean; + titleKeywords?: string[]; + contentKeywords?: string[]; + /** Include notes with diacritics in titles */ + includeDiacritics?: boolean; +} = {}) { + const { + matchFraction = 0.1, + labelsPerNote = 3, + depth = 4, + contentWordCount = 300, + varyContentSize = true, + titleKeywords = ["target"], + contentKeywords = titleKeywords, + includeDiacritics = false, + } = opts; + + becca.reset(); + for (const key of Object.keys(syntheticContent)) { + delete syntheticContent[key]; + } + + const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" })); + new BBranch({ + branchId: "none_root", + noteId: "root", + parentNoteId: "none", + notePosition: 10 + }); + + const containers: NoteBuilder[] = []; + let parent = rootNote; + for (let d = 0; d < depth; d++) { + const container = note(`Container_${d}_${randomWord(4)}`); + parent.child(container); + containers.push(container); + parent = container; + } + + const matchCount = Math.floor(noteCount * matchFraction); + const diacriticTitles = [ + "résumé", "naïve", "café", "über", "ñoño", "exposé", + "Ångström", "Üntersuchung", "São Paulo", "François" + ]; + + for (let i = 0; i < noteCount; i++) { + const isMatch = i < matchCount; + let title: string; + + if (includeDiacritics && i % 20 === 0) { + // Every 
20th note gets a diacritics-heavy title + const dTitle = diacriticTitles[i % diacriticTitles.length]; + title = isMatch + ? `${dTitle} ${titleKeywords.join(" ")} Document ${i}` + : `${dTitle} ${randomWord(5)} Note ${i}`; + } else { + title = isMatch + ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}` + : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`; + } + + const n = note(title); + + for (let l = 0; l < labelsPerNote; l++) { + const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`; + const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8); + n.label(labelName, labelValue); + } + + let noteWordCount = contentWordCount; + if (varyContentSize) { + const r = Math.random(); + if (r < 0.2) noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3)); + else if (r < 0.7) noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6)); + else if (r < 0.9) noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7)); + else noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0)); + } + + const includeContentKeyword = isMatch && contentKeywords.length > 0; + syntheticContent[n.note.noteId] = generateHtmlContent( + noteWordCount, + includeContentKeyword, + includeContentKeyword ? 
contentKeywords : undefined + ); + + const containerIndex = i % containers.length; + containers[containerIndex].child(n); + } + + // Monkeypatch getContent() + for (const noteObj of Object.values(becca.notes)) { + const noteId = noteObj.noteId; + if (syntheticContent[noteId]) { + (noteObj as any).getContent = () => syntheticContent[noteId]; + } else { + (noteObj as any).getContent = () => ""; + } + } + + return { rootNote, matchCount }; +} + +// ── benchmark runner ───────────────────────────────────────────────── + +interface BenchmarkResult { + query: string; + mode: string; + noteCount: number; + avgMs: number; + minMs: number; + resultCount: number; +} + +function runBenchmark( + query: string, + mode: "autocomplete" | "fullSearch", + fuzzyEnabled: boolean, + iterations = 5 +): BenchmarkResult { + const noteCount = Object.keys(becca.notes).length; + + // Warm up + if (mode === "autocomplete") { + searchService.searchNotesForAutocomplete(query, true); + } else { + const ctx = new SearchContext({ fastSearch: false }); + ctx.enableFuzzyMatching = fuzzyEnabled; + searchService.findResultsWithQuery(query, ctx); + } + + const times: number[] = []; + let resultCount = 0; + + for (let i = 0; i < iterations; i++) { + if (mode === "autocomplete") { + // For autocomplete, fuzzy is controlled by the global option + // We'll manipulate enableFuzzyMatching after construction + const [results, ms] = timed(() => { + // searchNotesForAutocomplete creates its own SearchContext internally + // so we need to test via findResultsWithQuery for fuzzy control + const ctx = new SearchContext({ + fastSearch: true, + includeHiddenNotes: true, + fuzzyAttributeSearch: true, + ignoreInternalAttributes: true, + autocomplete: true + }); + ctx.enableFuzzyMatching = fuzzyEnabled; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } else { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: false }); 
+ ctx.enableFuzzyMatching = fuzzyEnabled; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } + } + + return { + query, + mode: `${mode}${fuzzyEnabled ? "+fuzzy" : ""}`, + noteCount, + avgMs: avg(times), + minMs: min(times), + resultCount + }; +} + +function printTable(title: string, results: BenchmarkResult[]) { + console.log(`\n${"═".repeat(110)}`); + console.log(` ${title}`); + console.log(`${"═".repeat(110)}`); + console.log( + " " + + "Query".padEnd(35) + + "Mode".padEnd(22) + + "Notes".padStart(7) + + "Avg (ms)".padStart(12) + + "Min (ms)".padStart(12) + + "Results".padStart(10) + ); + console.log(` ${"─".repeat(98)}`); + for (const r of results) { + console.log( + " " + + `"${r.query}"`.padEnd(35) + + r.mode.padEnd(22) + + String(r.noteCount).padStart(7) + + r.avgMs.toFixed(1).padStart(12) + + r.minMs.toFixed(1).padStart(12) + + String(r.resultCount).padStart(10) + ); + } + console.log(`${"═".repeat(110)}\n`); +} + +// ── tests ──────────────────────────────────────────────────────────── + +describe("Comprehensive Search Benchmark", () => { + + afterEach(() => { + becca.reset(); + }); + + describe("Single-token queries", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + it(`single token @ ${noteCount} notes — fuzzy on vs off, autocomplete vs full`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting"], + contentKeywords: ["meeting"], + contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("meeting", "autocomplete", false), + runBenchmark("meeting", "autocomplete", true), + runBenchmark("meeting", "fullSearch", false), + runBenchmark("meeting", "fullSearch", true), + ]; + + printTable(`Single Token "meeting" — ${noteCount} notes`, results); + expect(results[0].resultCount).toBeGreaterThan(0); + }); + } + }); + + describe("Multi-token queries", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + 
it(`multi token @ ${noteCount} notes — fuzzy on vs off`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting", "notes", "january"], + contentKeywords: ["meeting", "notes", "january"], + contentWordCount: 400, + }); + + const results: BenchmarkResult[] = [ + // 2-token + runBenchmark("meeting notes", "autocomplete", false), + runBenchmark("meeting notes", "autocomplete", true), + runBenchmark("meeting notes", "fullSearch", false), + runBenchmark("meeting notes", "fullSearch", true), + // 3-token + runBenchmark("meeting notes january", "autocomplete", false), + runBenchmark("meeting notes january", "autocomplete", true), + runBenchmark("meeting notes january", "fullSearch", false), + runBenchmark("meeting notes january", "fullSearch", true), + ]; + + printTable(`Multi Token — ${noteCount} notes`, results); + expect(results[0].resultCount).toBeGreaterThan(0); + }); + } + }); + + describe("No-match queries (worst case — full scan, zero results)", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + it(`no-match @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.1, + titleKeywords: ["target"], + contentKeywords: ["target"], + contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("xyznonexistent", "autocomplete", false), + runBenchmark("xyznonexistent", "autocomplete", true), + runBenchmark("xyznonexistent", "fullSearch", false), + runBenchmark("xyznonexistent", "fullSearch", true), + runBenchmark("xyzfoo xyzbar", "autocomplete", false), + runBenchmark("xyzfoo xyzbar", "autocomplete", true), + runBenchmark("xyzfoo xyzbar", "fullSearch", false), + runBenchmark("xyzfoo xyzbar", "fullSearch", true), + ]; + + printTable(`No-Match Queries — ${noteCount} notes`, results); + // All should return 0 results + for (const r of results) { + expect(r.resultCount).toBe(0); + } + }); + } + }); + + describe("Diacritics / Unicode queries", () => { + for (const noteCount of [1000, 
5000, 10000]) { + it(`diacritics @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["résumé"], + contentKeywords: ["résumé"], + contentWordCount: 300, + includeDiacritics: true, + }); + + const results: BenchmarkResult[] = [ + // Exact diacritics + runBenchmark("résumé", "autocomplete", false), + runBenchmark("résumé", "autocomplete", true), + // ASCII equivalent (should still match via normalize) + runBenchmark("resume", "autocomplete", false), + runBenchmark("resume", "autocomplete", true), + // Full search + runBenchmark("résumé", "fullSearch", false), + runBenchmark("resume", "fullSearch", false), + ]; + + printTable(`Diacritics "résumé" / "resume" — ${noteCount} notes`, results); + }); + } + }); + + describe("Partial / prefix queries (simulating typing)", () => { + for (const noteCount of [5000, 10000, 20000]) { + it(`typing progression @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["documentation"], + contentKeywords: ["documentation"], + contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("d", "autocomplete", false), + runBenchmark("do", "autocomplete", false), + runBenchmark("doc", "autocomplete", false), + runBenchmark("docu", "autocomplete", false), + runBenchmark("docum", "autocomplete", false), + runBenchmark("document", "autocomplete", false), + runBenchmark("documentation", "autocomplete", false), + // Same with fuzzy + runBenchmark("d", "autocomplete", true), + runBenchmark("doc", "autocomplete", true), + runBenchmark("document", "autocomplete", true), + runBenchmark("documentation", "autocomplete", true), + ]; + + printTable(`Typing Progression "documentation" — ${noteCount} notes`, results); + }); + } + }); + + describe("Attribute-matching queries", () => { + for (const noteCount of [5000, 10000]) { + it(`attribute search @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + 
labelsPerNote: 5, + titleKeywords: ["important"], + contentKeywords: ["important"], + contentWordCount: 200, + }); + + const results: BenchmarkResult[] = [ + // "category" is a label name on matching notes + runBenchmark("category", "autocomplete", false), + runBenchmark("category", "autocomplete", true), + runBenchmark("category", "fullSearch", false), + runBenchmark("category", "fullSearch", true), + // "important" appears in both title and label value + runBenchmark("important", "autocomplete", false), + runBenchmark("important", "autocomplete", true), + ]; + + printTable(`Attribute Matching — ${noteCount} notes`, results); + }); + } + }); + + describe("Long queries (4-5 tokens)", () => { + for (const noteCount of [5000, 10000]) { + it(`long query @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.10, + titleKeywords: ["quarterly", "budget", "review", "report"], + contentKeywords: ["quarterly", "budget", "review", "report"], + contentWordCount: 500, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("quarterly", "autocomplete", false), + runBenchmark("quarterly budget", "autocomplete", false), + runBenchmark("quarterly budget review", "autocomplete", false), + runBenchmark("quarterly budget review report", "autocomplete", false), + // Same with fuzzy + runBenchmark("quarterly budget review report", "autocomplete", true), + // Full search + runBenchmark("quarterly budget review report", "fullSearch", false), + runBenchmark("quarterly budget review report", "fullSearch", true), + ]; + + printTable(`Long Queries (4 tokens) — ${noteCount} notes`, results); + }); + } + }); + + describe("Mixed scenario — realistic user session", () => { + it("simulates a user session with varied queries @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.15, + titleKeywords: ["project", "planning"], + contentKeywords: ["project", "planning", "timeline", "budget"], + contentWordCount: 400, + varyContentSize: true, + includeDiacritics: 
true, + depth: 6, + }); + + const results: BenchmarkResult[] = [ + // Quick autocomplete lookups (user typing in search bar) + runBenchmark("pro", "autocomplete", false), + runBenchmark("project", "autocomplete", false), + runBenchmark("project plan", "autocomplete", false), + + // Full search (user hits Enter) + runBenchmark("project", "fullSearch", false), + runBenchmark("project planning", "fullSearch", false), + runBenchmark("project planning", "fullSearch", true), + + // Typo / near-miss with fuzzy + runBenchmark("projct", "autocomplete", false), + runBenchmark("projct", "autocomplete", true), + runBenchmark("projct planing", "fullSearch", false), + runBenchmark("projct planing", "fullSearch", true), + + // No results + runBenchmark("xyznonexistent", "autocomplete", false), + runBenchmark("xyznonexistent foo", "fullSearch", true), + + // Short common substring + runBenchmark("note", "autocomplete", false), + runBenchmark("document", "autocomplete", false), + ]; + + printTable("Realistic User Session — 10K notes", results); + }); + }); + + describe("Cache warmth impact", () => { + it("cold vs warm flat text index @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.15, + titleKeywords: ["target"], + contentKeywords: ["target"], + contentWordCount: 300, + }); + + console.log(`\n${"═".repeat(80)}`); + console.log(" Cold vs Warm Cache — 10K notes"); + console.log(`${"═".repeat(80)}`); + + // Cold: first search after dataset build (flat text index not yet built) + becca.flatTextIndex = null; + becca.dirtyFlatTextNoteIds.clear(); + const [coldResults, coldMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Cold (index build + search): ${coldMs.toFixed(1)}ms (${coldResults.length} results)`); + + // Warm: subsequent searches reuse the index + const warmTimes: number[] = []; + for (let i = 0; i < 5; 
i++) { + const [, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + warmTimes.push(ms); + } + console.log(` Warm (reuse index, 5 runs): avg ${avg(warmTimes).toFixed(1)}ms min ${min(warmTimes).toFixed(1)}ms`); + + // Incremental: dirty a few notes and search again + const noteIds = Object.keys(becca.notes).slice(0, 50); + for (const nid of noteIds) { + becca.dirtyNoteFlatText(nid); + } + const [, incrMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Incremental (50 dirty notes): ${incrMs.toFixed(1)}ms`); + + // Full rebuild + becca.flatTextIndex = null; + const [, rebuildMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Full rebuild (index = null): ${rebuildMs.toFixed(1)}ms`); + + console.log(`${"═".repeat(80)}\n`); + }); + }); + + describe("Fuzzy matching effectiveness comparison", () => { + it("exact vs fuzzy result quality @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.10, + titleKeywords: ["performance"], + contentKeywords: ["performance", "optimization"], + contentWordCount: 300, + }); + + console.log(`\n${"═".repeat(90)}`); + console.log(" Fuzzy Matching Effectiveness — 10K notes"); + console.log(`${"═".repeat(90)}`); + console.log( + " " + + "Query".padEnd(30) + + "Fuzzy".padEnd(8) + + "Time (ms)".padStart(12) + + "Results".padStart(10) + + " Notes" + ); + console.log(` ${"─".repeat(70)}`); + + const queries = [ + "performance", // exact match + "performanc", // truncated + "preformance", // typo + "performence", // common misspelling + "optimization", // exact match + 
"optimzation", // typo + "perf optim", // abbreviated multi + ]; + + for (const query of queries) { + for (const fuzzy of [false, true]) { + const times: number[] = []; + let resultCount = 0; + for (let i = 0; i < 3; i++) { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + ctx.enableFuzzyMatching = fuzzy; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } + console.log( + " " + + `"${query}"`.padEnd(30) + + (fuzzy ? "ON" : "OFF").padEnd(8) + + avg(times).toFixed(1).padStart(12) + + String(resultCount).padStart(10) + ); + } + } + + console.log(`${"═".repeat(90)}\n`); + }); + }); + + describe("Scale comparison summary", () => { + it("summary table across all note counts", () => { + const summaryResults: BenchmarkResult[] = []; + + for (const noteCount of [1000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting", "notes"], + contentKeywords: ["meeting", "notes"], + contentWordCount: 400, + varyContentSize: true, + depth: 5, + }); + + // Core scenarios + summaryResults.push(runBenchmark("meeting", "autocomplete", false)); + summaryResults.push(runBenchmark("meeting", "autocomplete", true)); + summaryResults.push(runBenchmark("meeting notes", "autocomplete", false)); + summaryResults.push(runBenchmark("meeting notes", "autocomplete", true)); + summaryResults.push(runBenchmark("meeting", "fullSearch", false)); + summaryResults.push(runBenchmark("meeting", "fullSearch", true)); + summaryResults.push(runBenchmark("meeting notes", "fullSearch", false)); + summaryResults.push(runBenchmark("meeting notes", "fullSearch", true)); + summaryResults.push(runBenchmark("xyznonexistent", "autocomplete", false)); + summaryResults.push(runBenchmark("xyznonexistent", "fullSearch", true)); + } + + printTable("Scale Comparison Summary", summaryResults); + }); + }); +}); From ac13af73c50d7ece3009129d8b560e882f0b3f7a Mon Sep 17 
00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:38:56 -0700 Subject: [PATCH 10/33] feat(search): add FTS5 migration for content search index --- .../migrations/0235__add_fts5_content_search.ts | 14 ++++++++++++++ apps/server/src/migrations/migrations.ts | 5 +++++ 2 files changed, 19 insertions(+) create mode 100644 apps/server/src/migrations/0235__add_fts5_content_search.ts diff --git a/apps/server/src/migrations/0235__add_fts5_content_search.ts b/apps/server/src/migrations/0235__add_fts5_content_search.ts new file mode 100644 index 0000000000..d0767d51b4 --- /dev/null +++ b/apps/server/src/migrations/0235__add_fts5_content_search.ts @@ -0,0 +1,14 @@ +import sql from "../services/sql.js"; +import log from "../services/log.js"; + +export default () => { + sql.execute(/*sql*/` + CREATE VIRTUAL TABLE IF NOT EXISTS note_content_fts USING fts5( + noteId UNINDEXED, + content, + tokenize='unicode61 remove_diacritics 2' + ) + `); + + log.info("Created note_content_fts table. FTS index will be populated on first search."); +}; diff --git a/apps/server/src/migrations/migrations.ts b/apps/server/src/migrations/migrations.ts index 7aca1f802b..e9f8b8d72d 100644 --- a/apps/server/src/migrations/migrations.ts +++ b/apps/server/src/migrations/migrations.ts @@ -6,6 +6,11 @@ // Migrations should be kept in descending order, so the latest migration is first. 
const MIGRATIONS: (SqlMigration | JsMigration)[] = [ + // Add FTS5 virtual table for full-text content search + { + version: 235, + module: async () => import("./0235__add_fts5_content_search.js") + }, // Migrate aiChat notes to code notes since LLM integration has been removed { version: 234, From dcaebeea8312f7b1175ebae9a916b89a6fee118b Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:39:54 -0700 Subject: [PATCH 11/33] feat(search): add FTS5 index service for content search --- .../src/services/search/fts_index.spec.ts | 13 ++ apps/server/src/services/search/fts_index.ts | 178 ++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 apps/server/src/services/search/fts_index.spec.ts create mode 100644 apps/server/src/services/search/fts_index.ts diff --git a/apps/server/src/services/search/fts_index.spec.ts b/apps/server/src/services/search/fts_index.spec.ts new file mode 100644 index 0000000000..983c972a0b --- /dev/null +++ b/apps/server/src/services/search/fts_index.spec.ts @@ -0,0 +1,13 @@ +import { describe, it, expect } from "vitest"; + +describe("FTS Index Service", () => { + it("should export buildIndex, updateNote, removeNote, searchContent functions", async () => { + const ftsIndex = await import("./fts_index.js"); + expect(typeof ftsIndex.default.buildIndex).toBe("function"); + expect(typeof ftsIndex.default.updateNote).toBe("function"); + expect(typeof ftsIndex.default.removeNote).toBe("function"); + expect(typeof ftsIndex.default.searchContent).toBe("function"); + expect(typeof ftsIndex.default.isIndexBuilt).toBe("function"); + expect(typeof ftsIndex.default.resetIndex).toBe("function"); + }); +}); diff --git a/apps/server/src/services/search/fts_index.ts b/apps/server/src/services/search/fts_index.ts new file mode 100644 index 0000000000..83c7616999 --- /dev/null +++ b/apps/server/src/services/search/fts_index.ts @@ -0,0 +1,178 @@ +"use strict"; + +import sql from "../sql.js"; +import log from "../log.js"; +import 
protectedSessionService from "../protected_session.js"; +import preprocessContent from "./expressions/note_content_fulltext_preprocessor.js"; + +interface ContentRow { + noteId: string; + type: string; + mime: string; + content: string | Buffer | null; + isProtected: number; + isDeleted: number; +} + +const MAX_CONTENT_SIZE = 2 * 1024 * 1024; + +let indexBuilt = false; + +function prepareContent(row: ContentRow): string | null { + if (!row.content) return null; + if (row.isDeleted) return null; + + let content: string | undefined; + + if (row.isProtected) { + if (!protectedSessionService.isProtectedSessionAvailable()) { + return null; + } + try { + content = protectedSessionService.decryptString(row.content as string) || undefined; + } catch { + return null; + } + } else { + content = typeof row.content === "string" ? row.content : row.content.toString(); + } + + if (!content || content.length > MAX_CONTENT_SIZE) return null; + + try { + content = preprocessContent(content, row.type, row.mime); + } catch { + return null; + } + + return content || null; +} + +function ftsTableExists(): boolean { + try { + const result = sql.getValue( + "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='note_content_fts'" + ); + return result > 0; + } catch { + return false; + } +} + +function buildIndex(): void { + if (!ftsTableExists()) { + log.info("FTS5 table does not exist, skipping index build."); + return; + } + + const startTime = Date.now(); + log.info("Building FTS content index..."); + + sql.execute("DELETE FROM note_content_fts"); + + const count = sql.transactional(() => { + let count = 0; + + for (const row of sql.iterateRows(` + SELECT noteId, type, mime, content, isProtected, isDeleted + FROM notes JOIN blobs USING (blobId) + WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') + AND isDeleted = 0 + AND content IS NOT NULL + AND LENGTH(content) < ${MAX_CONTENT_SIZE} + `)) { + const processedContent = prepareContent(row); + if (processedContent) 
{ + sql.execute( + "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", + [row.noteId, processedContent] + ); + count++; + } + } + + return count; + }); + + const elapsed = Date.now() - startTime; + log.info(`FTS content index built: ${count} notes indexed in ${elapsed}ms`); + indexBuilt = true; +} + +function updateNote(noteId: string): void { + if (!indexBuilt || !ftsTableExists()) return; + + sql.execute("DELETE FROM note_content_fts WHERE noteId = ?", [noteId]); + + const row = sql.getRowOrNull(` + SELECT noteId, type, mime, content, isProtected, isDeleted + FROM notes JOIN blobs USING (blobId) + WHERE noteId = ? + `, [noteId]); + + if (!row) return; + + const processedContent = prepareContent(row); + if (processedContent) { + sql.execute( + "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", + [row.noteId, processedContent] + ); + } +} + +function removeNote(noteId: string): void { + if (!indexBuilt || !ftsTableExists()) return; + sql.execute("DELETE FROM note_content_fts WHERE noteId = ?", [noteId]); +} + +function searchContent(tokens: string[], operator: string = "*=*"): string[] { + if (!ftsTableExists()) return []; + + if (!indexBuilt) { + buildIndex(); + } + + const escapedTokens = tokens.map(t => { + const cleaned = t.replace(/["*^(){}:]/g, ""); + if (!cleaned) return null; + return `"${cleaned}"`; + }).filter(Boolean); + + if (escapedTokens.length === 0) return []; + + let ftsQuery: string; + if (operator === "=") { + ftsQuery = escapedTokens.join(" "); + } else { + ftsQuery = escapedTokens.join(" AND "); + } + + try { + const results = sql.getColumn( + "SELECT noteId FROM note_content_fts WHERE note_content_fts MATCH ? 
ORDER BY rank", + [ftsQuery] + ); + return results; + } catch (e) { + log.info(`FTS5 query failed for "${ftsQuery}": ${e}`); + return []; + } +} + +function isIndexBuilt(): boolean { + return indexBuilt; +} + +function resetIndex(): void { + indexBuilt = false; +} + +export default { + buildIndex, + updateNote, + removeNote, + searchContent, + isIndexBuilt, + resetIndex +}; From f358563c27de16781e3da9a37594520be507a44d Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:40:33 -0700 Subject: [PATCH 12/33] feat(search): wire FTS index updates to note content changes --- apps/server/src/services/handlers.ts | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/apps/server/src/services/handlers.ts b/apps/server/src/services/handlers.ts index f32bf6ddd0..89647d8d83 100644 --- a/apps/server/src/services/handlers.ts +++ b/apps/server/src/services/handlers.ts @@ -60,6 +60,17 @@ eventService.subscribe([eventService.ENTITY_CHANGED, eventService.ENTITY_DELETED } else if (entityName === "notes") { // ENTITY_DELETED won't trigger anything since all branches/attributes are already deleted at this point runAttachedRelations(entity, "runOnNoteChange", entity); + + if (entity.isDeleted) { + try { + const ftsIndex = require("./search/fts_index.js").default; + if (ftsIndex.isIndexBuilt()) { + ftsIndex.removeNote(entity.noteId); + } + } catch { + // FTS index update failure should not block note operations + } + } } }); @@ -81,6 +92,16 @@ eventService.subscribe(eventService.ENTITY_CHANGED, ({ entityName, entity }) => eventService.subscribe(eventService.NOTE_CONTENT_CHANGE, ({ entity }) => { runAttachedRelations(entity, "runOnNoteContentChange", entity); + + // Update FTS content index incrementally + try { + const ftsIndex = require("./search/fts_index.js").default; + if (ftsIndex.isIndexBuilt()) { + ftsIndex.updateNote(entity.noteId); + } + } catch { + // FTS index update failure should not block note saves + } }); 
eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) => { From bc0942180e97c6af915d9d4b5f428adb86e019a0 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:42:20 -0700 Subject: [PATCH 13/33] feat(search): use FTS5 index in NoteContentFulltextExp with sequential fallback For operators =, !=, and *=*, the search now tries the FTS5 index first via searchViaFts(). If FTS is unavailable or fails, it falls back to the original sequential scan. The flat text attribute search is extracted into its own searchFlatTextAttributes() method and runs after both paths. --- .../expressions/note_content_fulltext.ts | 112 +++++++++++++----- 1 file changed, 81 insertions(+), 31 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index f3e0a39333..86aa40e36a 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -79,7 +79,17 @@ class NoteContentFulltextExp extends Expression { const resultNoteSet = new NoteSet(); - // Search through notes with content + // Try FTS5 index first for supported operators + if (this.canUseFts()) { + const ftsWorked = this.searchViaFts(inputNoteSet, resultNoteSet); + if (ftsWorked) { + this.searchFlatTextAttributes(inputNoteSet, resultNoteSet); + return resultNoteSet; + } + // FTS unavailable or failed — fall through to sequential scan + } + + // Fallback: sequential scan (original behavior) for (const row of sql.iterateRows(` SELECT noteId, type, mime, content, isProtected FROM notes JOIN blobs USING (blobId) @@ -89,43 +99,83 @@ class NoteContentFulltextExp extends Expression { this.findInText(row, inputNoteSet, resultNoteSet); } - // For exact match with flatText, also search notes WITHOUT content (they may have matching attributes) - if (this.flatText && (this.operator === "=" || this.operator === 
"!=")) { - for (const note of inputNoteSet.notes) { - // Skip if already found or doesn't exist - if (resultNoteSet.hasNoteId(note.noteId) || !(note.noteId in becca.notes)) { - continue; - } + this.searchFlatTextAttributes(inputNoteSet, resultNoteSet); + return resultNoteSet; + } - const noteFromBecca = becca.notes[note.noteId]; - const flatText = noteFromBecca.getFlatText(); + /** + * Whether this operator can be served by FTS5. + */ + private canUseFts(): boolean { + return ["=", "!=", "*=*"].includes(this.operator); + } - // For flatText, only check attribute values (format: #name=value or ~name=value) - // Don't match against noteId, type, mime, or title which are also in flatText - let matches = false; - const phrase = this.tokens.join(" "); - const normalizedPhrase = normalizeSearchText(phrase); - const normalizedFlatText = normalizeSearchText(flatText); + /** + * Attempts to use the FTS5 index for content search. + * Returns true if FTS was used successfully, false to fall back to sequential scan. 
+ */ + private searchViaFts(inputNoteSet: NoteSet, resultNoteSet: NoteSet): boolean { + try { + const ftsIndex = require("../fts_index.js").default; + const matchingNoteIds = ftsIndex.searchContent(this.tokens, this.operator); - // Check if =phrase appears in flatText (indicates attribute value match) - // For single words, use word-boundary matching to avoid substring matches - if (!normalizedPhrase.includes(' ')) { - // Single word: look for =word with word boundaries - // Split by = to get attribute values, then check each value for exact word match - const parts = normalizedFlatText.split('='); - matches = parts.slice(1).some(part => this.exactWordMatch(normalizedPhrase, part)); - } else { - // Multi-word phrase: check for substring match - matches = normalizedFlatText.includes(`=${normalizedPhrase}`); - } - - if ((this.operator === "=" && matches) || (this.operator === "!=" && !matches)) { - resultNoteSet.add(noteFromBecca); + for (const noteId of matchingNoteIds) { + if (inputNoteSet.hasNoteId(noteId) && noteId in becca.notes) { + if (this.operator === "!=") { + continue; + } + resultNoteSet.add(becca.notes[noteId]); } } + + if (this.operator === "!=") { + const matchingSet = new Set(matchingNoteIds); + for (const note of inputNoteSet.notes) { + if (!matchingSet.has(note.noteId) && note.noteId in becca.notes) { + resultNoteSet.add(becca.notes[note.noteId]); + } + } + } + + return true; + } catch { + return false; + } + } + + /** + * Searches flat text attributes for = and != operators. + * Extracted from the old execute() tail. 
+ */ + private searchFlatTextAttributes(inputNoteSet: NoteSet, resultNoteSet: NoteSet): void { + if (!this.flatText || (this.operator !== "=" && this.operator !== "!=")) { + return; } - return resultNoteSet; + for (const note of inputNoteSet.notes) { + if (resultNoteSet.hasNoteId(note.noteId) || !(note.noteId in becca.notes)) { + continue; + } + + const noteFromBecca = becca.notes[note.noteId]; + const flatText = noteFromBecca.getFlatText(); + + let matches = false; + const phrase = this.tokens.join(" "); + const normalizedPhrase = normalizeSearchText(phrase); + const normalizedFlatText = normalizeSearchText(flatText); + + if (!normalizedPhrase.includes(' ')) { + const parts = normalizedFlatText.split('='); + matches = parts.slice(1).some(part => this.exactWordMatch(normalizedPhrase, part)); + } else { + matches = normalizedFlatText.includes(`=${normalizedPhrase}`); + } + + if ((this.operator === "=" && matches) || (this.operator === "!=" && !matches)) { + resultNoteSet.add(noteFromBecca); + } + } } /** From 06fb9c0a6bf9efae5a70a41389159152d3674e3d Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:43:23 -0700 Subject: [PATCH 14/33] test(search): add FTS5 integration test --- apps/server/spec/fts5_search.spec.ts | 76 ++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 apps/server/spec/fts5_search.spec.ts diff --git a/apps/server/spec/fts5_search.spec.ts b/apps/server/spec/fts5_search.spec.ts new file mode 100644 index 0000000000..7d6ed7483a --- /dev/null +++ b/apps/server/spec/fts5_search.spec.ts @@ -0,0 +1,76 @@ +import { Application } from "express"; +import { beforeAll, describe, expect, it } from "vitest"; +import config from "../src/services/config.js"; + +let app: Application; + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +describe("FTS5 Content Search (integration)", () => { + beforeAll(async () => { + 
config.General.noAuthentication = true; + const buildApp = (await import("../src/app.js")).default; + app = await buildApp(); + }); + + it("FTS5 index builds and searches correctly", async () => { + const sql = (await import("../src/services/sql.js")).default; + const becca = (await import("../src/becca/becca.js")).default; + const ftsIndex = (await import("../src/services/search/fts_index.js")).default; + const cls = (await import("../src/services/cls.js")).default; + + await new Promise((resolve) => { + cls.init(() => { + // Check if FTS table exists (migration may not have run on test DB) + const tableExists = sql.getValue( + "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='note_content_fts'" + ); + + if (!tableExists) { + // Create the table for testing + sql.execute(` + CREATE VIRTUAL TABLE IF NOT EXISTS note_content_fts USING fts5( + noteId UNINDEXED, + content, + tokenize='unicode61 remove_diacritics 2' + ) + `); + } + + const noteCount = Object.keys(becca.notes).length; + console.log(`\n Notes in becca: ${noteCount}`); + + // Build the index + ftsIndex.resetIndex(); + const [, buildMs] = timed(() => ftsIndex.buildIndex()); + console.log(` FTS index build: ${buildMs.toFixed(0)}ms`); + + // Verify index has content + const indexedCount = sql.getValue("SELECT COUNT(*) FROM note_content_fts"); + console.log(` Notes indexed: ${indexedCount}`); + expect(indexedCount).toBeGreaterThanOrEqual(0); + + // If we have indexed content, test search + if (indexedCount > 0) { + const [results, searchMs] = timed(() => ftsIndex.searchContent(["note"], "*=*")); + console.log(` FTS search "note": ${searchMs.toFixed(1)}ms (${results.length} results)`); + expect(results).toBeInstanceOf(Array); + } + + // Test update and remove don't throw + expect(() => ftsIndex.updateNote("nonexistent")).not.toThrow(); + expect(() => ftsIndex.removeNote("nonexistent")).not.toThrow(); + + // Clean up + sql.execute("DELETE FROM note_content_fts"); + ftsIndex.resetIndex(); + + 
resolve(); + }); + }); + }); +}); From 24a01aefe2f674f0362e8e0f207b10af1669f0bd Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 11:44:19 -0700 Subject: [PATCH 15/33] feat(search): add user option to enable/disable FTS5 content index --- apps/client/src/translations/en/translation.json | 3 ++- apps/client/src/widgets/type_widgets/options/other.tsx | 7 +++++++ apps/server/src/routes/api/options.ts | 1 + apps/server/src/services/options_init.ts | 1 + .../services/search/expressions/note_content_fulltext.ts | 8 ++++++++ packages/commons/src/lib/options_interface.ts | 2 ++ 6 files changed, 21 insertions(+), 1 deletion(-) diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index f9ba8f8743..5232fa1dc1 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1294,7 +1294,8 @@ }, "search": { "title": "Search", - "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)" + "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)", + "enable_fts5": "Use content index for faster full-text search (applies on next search)" }, "search_engine": { "title": "Search Engine", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx b/apps/client/src/widgets/type_widgets/options/other.tsx index 8cb99bace4..c16fffc891 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -39,6 +39,7 @@ export default function OtherSettings() { function SearchSettings() { const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching"); + const [ fts5Enabled, setFts5Enabled ] = useTriliumOptionBool("searchEnableFts5"); return ( @@ -48,6 +49,12 @@ function SearchSettings() { currentValue={fuzzyEnabled} onChange={setFuzzyEnabled} /> 
+ ); } diff --git a/apps/server/src/routes/api/options.ts b/apps/server/src/routes/api/options.ts index 049a898fca..5491fbf31e 100644 --- a/apps/server/src/routes/api/options.ts +++ b/apps/server/src/routes/api/options.ts @@ -98,6 +98,7 @@ const ALLOWED_OPTIONS = new Set([ "backgroundEffects", "allowedHtmlTags", "searchEnableFuzzyMatching", + "searchEnableFts5", "redirectBareDomain", "showLoginInShareTheme", "splitEditorOrientation", diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index 17ea5a1f0b..de22bbaafd 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -200,6 +200,7 @@ const defaultOptions: DefaultOption[] = [ // Search settings { name: "searchEnableFuzzyMatching", value: "true", isSynced: true }, + { name: "searchEnableFts5", value: "true", isSynced: true }, // Share settings { name: "redirectBareDomain", value: "false", isSynced: true }, diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index 86aa40e36a..f2e0122d32 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -107,6 +107,14 @@ class NoteContentFulltextExp extends Expression { * Whether this operator can be served by FTS5. 
*/ private canUseFts(): boolean { + try { + const optionService = require("../../options.js").default; + if (!optionService.getOptionBool("searchEnableFts5")) { + return false; + } + } catch { + // Option not available yet — allow FTS + } return ["=", "!=", "*=*"].includes(this.operator); } diff --git a/packages/commons/src/lib/options_interface.ts b/packages/commons/src/lib/options_interface.ts index 6e36ebd7a3..555fd20a5b 100644 --- a/packages/commons/src/lib/options_interface.ts +++ b/packages/commons/src/lib/options_interface.ts @@ -137,6 +137,8 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions Date: Fri, 20 Mar 2026 11:56:49 -0700 Subject: [PATCH 16/33] fix(search): fix busy connection error in FTS5 index build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collect rows before inserting — iterateRows() holds an open cursor that conflicts with writes on the same connection. --- apps/server/src/services/search/fts_index.ts | 42 +++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/apps/server/src/services/search/fts_index.ts b/apps/server/src/services/search/fts_index.ts index 83c7616999..30359f7d19 100644 --- a/apps/server/src/services/search/fts_index.ts +++ b/apps/server/src/services/search/fts_index.ts @@ -70,28 +70,32 @@ function buildIndex(): void { sql.execute("DELETE FROM note_content_fts"); - const count = sql.transactional(() => { - let count = 0; + // Collect all rows first, then batch-insert in a transaction. + // iterateRows() holds an open cursor that conflicts with writes on the same connection. 
+ const prepared: { noteId: string; content: string }[] = []; - for (const row of sql.iterateRows(` - SELECT noteId, type, mime, content, isProtected, isDeleted - FROM notes JOIN blobs USING (blobId) - WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') - AND isDeleted = 0 - AND content IS NOT NULL - AND LENGTH(content) < ${MAX_CONTENT_SIZE} - `)) { - const processedContent = prepareContent(row); - if (processedContent) { - sql.execute( - "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", - [row.noteId, processedContent] - ); - count++; - } + for (const row of sql.iterateRows(` + SELECT noteId, type, mime, content, isProtected, isDeleted + FROM notes JOIN blobs USING (blobId) + WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') + AND isDeleted = 0 + AND content IS NOT NULL + AND LENGTH(content) < ${MAX_CONTENT_SIZE} + `)) { + const processedContent = prepareContent(row); + if (processedContent) { + prepared.push({ noteId: row.noteId, content: processedContent }); } + } - return count; + const count = sql.transactional(() => { + for (const { noteId, content } of prepared) { + sql.execute( + "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", + [noteId, content] + ); + } + return prepared.length; }); const elapsed = Date.now() - startTime; From 87fc4e12811c01ea7d01ee9b32ceb54963c83803 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 12:00:16 -0700 Subject: [PATCH 17/33] docs(search): add FTS5 benchmark results to performance comparison Adds real SQLite benchmarks showing FTS5 is 15-33x faster for the raw content query, though end-to-end improvement is masked by JS pipeline overhead (scoring, snippets, path walking). 
--- docs/search-performance-benchmarks.md | 465 ++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 docs/search-performance-benchmarks.md diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md new file mode 100644 index 0000000000..bb7411230c --- /dev/null +++ b/docs/search-performance-benchmarks.md @@ -0,0 +1,465 @@ +# Search Performance Benchmarks: `main` vs `feat/search-perf-take1` + +> **Date:** 2026-03-20 +> **Environment:** In-memory benchmarks (monkeypatched `getContent()`, no real SQLite I/O). Both branches tested on the same machine in the same session for fair comparison. All times are avg of 5 iterations with warm caches unless noted. +> **Benchmark source:** `apps/server/src/services/search/services/search_benchmark.spec.ts` + +--- + +## Table of Contents + +- [Single-Token Autocomplete](#single-token-autocomplete) +- [Multi-Token Autocomplete](#multi-token-autocomplete) +- [No-Match Queries (worst case)](#no-match-queries-worst-case) +- [Diacritics / Unicode](#diacritics--unicode) +- [Typing Progression (keystroke simulation)](#typing-progression-keystroke-simulation) +- [Long Queries (4 tokens)](#long-queries-4-tokens) +- [Attribute Matching](#attribute-matching) +- [Fuzzy Matching Effectiveness (typos & misspellings)](#fuzzy-matching-effectiveness-typos--misspellings) +- [Cache Warmth Impact (feature branch only)](#cache-warmth-impact-feature-branch-only) +- [Realistic User Session](#realistic-user-session) +- [Scale Comparison Summary](#scale-comparison-summary) + +--- + +## Single-Token Autocomplete + +The most common case — user typing in the search bar. Query: `"meeting"`. 
+ +### Autocomplete (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.6ms | 2.8ms | **-22%** | +| 5,000 | 11.9ms | 10.6ms | **-11%** | +| 10,000 | 27.5ms | 22.8ms | **-17%** | +| 20,000 | 53.7ms | 46.2ms | **-14%** | + +### Autocomplete (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.4ms | 2.3ms | -4% | +| 5,000 | 11.7ms | 10.7ms | **-9%** | +| 10,000 | 28.9ms | 21.6ms | **-25%** | +| 20,000 | 58.6ms | 44.5ms | **-24%** | + +### Full Search (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.7ms | 4.3ms | +59% | +| 5,000 | 14.3ms | 10.8ms | **-24%** | +| 10,000 | 30.8ms | 26.9ms | **-13%** | +| 20,000 | 63.1ms | 56.7ms | **-10%** | + +### Full Search (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.5ms | 2.4ms | -4% | +| 5,000 | 13.0ms | 11.4ms | **-12%** | +| 10,000 | 29.8ms | 25.6ms | **-14%** | +| 20,000 | 63.4ms | 54.5ms | **-14%** | + +--- + +## Multi-Token Autocomplete + +### 2-Token: `"meeting notes"` (autocomplete, fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.7ms | 3.5ms | -5% | +| 5,000 | 19.0ms | 19.3ms | +2% | +| 10,000 | 40.2ms | 40.4ms | 0% | +| 20,000 | 86.1ms | 80.7ms | **-6%** | + +### 3-Token: `"meeting notes january"` (autocomplete, fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 4.1ms | 4.3ms | +5% | +| 5,000 | 25.7ms | 24.9ms | -3% | +| 10,000 | 50.9ms | 50.5ms | -1% | +| 20,000 | 104.5ms | 107.2ms | +3% | + +### 2-Token: `"meeting notes"` (full search, fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.4ms | 3.3ms | -3% | +| 5,000 | 22.3ms | 21.9ms | -2% | +| 10,000 | 42.9ms | 40.2ms | **-6%** | +| 20,000 | 95.8ms | 88.3ms | **-8%** | + +### 3-Token: `"meeting notes january"` (full search, 
fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 4.4ms | 4.3ms | -2% | +| 5,000 | 26.3ms | 25.5ms | -3% | +| 10,000 | 51.7ms | 52.6ms | +2% | +| 20,000 | 113.9ms | 114.0ms | 0% | + +--- + +## No-Match Queries (worst case) + +These are the worst case — every note must be scanned with no early exit. + +### Single token: `"xyznonexistent"` (autocomplete) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 0.7ms | 0.5ms | **-29%** | +| 5,000 | 4.0ms | 3.4ms | **-15%** | +| 10,000 | 11.3ms | 7.0ms | **-38%** | +| 20,000 | 28.9ms | 19.0ms | **-34%** | + +### Single token: `"xyznonexistent"` (autocomplete, fuzzy ON) + +This is the biggest behavioral change. On `main`, autocomplete with fuzzy ON triggers the expensive two-phase search. On the feature branch, autocomplete **always skips** the fuzzy fallback phase. + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 1.7ms | 0.5ms | **-71%** | +| 5,000 | 12.8ms | 2.3ms | **-82%** | +| 10,000 | 26.4ms | 6.0ms | **-77%** | +| 20,000 | 60.4ms | 20.0ms | **-67%** | + +### Multi token: `"xyzfoo xyzbar"` (autocomplete, fuzzy ON) + +Same effect — autocomplete no longer triggers the fuzzy fallback: + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 6.5ms | 0.4ms | **-94%** | +| 5,000 | 33.9ms | 2.5ms | **-93%** | +| 10,000 | 134.5ms | 6.0ms | **-96%** | +| 20,000 | 151.8ms | 19.8ms | **-87%** | + +### Multi token: `"xyzfoo xyzbar"` (full search, fuzzy ON) + +Full search still does two-phase fuzzy on both branches, so improvement here is from the flat text index and pre-normalized attributes: + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 5.9ms | 5.8ms | -2% | +| 5,000 | 35.0ms | 33.7ms | -4% | +| 10,000 | 144.0ms | 68.8ms | **-52%** | +| 20,000 | 165.5ms | 140.6ms | **-15%** | + +--- + +## Diacritics / Unicode + +Searching `"résumé"` (with 
diacritics) vs `"resume"` (ASCII equivalent). Both forms find the same results thanks to diacritic normalization. + +### Autocomplete (fuzzy OFF) + +| Notes | Query | main | feature | Change | +|------:|:------|-----:|--------:|-------:| +| 1,000 | `"résumé"` | 4.1ms | 2.4ms | **-41%** | +| 1,000 | `"resume"` | 2.9ms | 2.4ms | **-17%** | +| 5,000 | `"résumé"` | 20.4ms | 15.0ms | **-26%** | +| 5,000 | `"resume"` | 18.1ms | 16.3ms | **-10%** | +| 10,000 | `"résumé"` | 40.6ms | 29.0ms | **-29%** | +| 10,000 | `"resume"` | 40.6ms | 29.5ms | **-27%** | + +--- + +## Typing Progression (keystroke simulation) + +Simulates a user typing `"documentation"` character by character. Autocomplete, fuzzy OFF. + +### 5,000 notes + +| Prefix | main | feature | Change | +|:-------|-----:|--------:|-------:| +| `"d"` | 44.7ms | 35.9ms | **-20%** | +| `"do"` | 12.9ms | 11.6ms | **-10%** | +| `"doc"` | 12.0ms | 10.2ms | **-15%** | +| `"docu"` | 10.9ms | 9.4ms | **-14%** | +| `"document"` | 9.1ms | 7.3ms | **-20%** | +| `"documentation"` | 10.3ms | 8.1ms | **-21%** | + +### 10,000 notes + +| Prefix | main | feature | Change | +|:-------|-----:|--------:|-------:| +| `"d"` | 85.4ms | 70.1ms | **-18%** | +| `"do"` | 30.0ms | 24.1ms | **-20%** | +| `"doc"` | 28.3ms | 20.8ms | **-27%** | +| `"docu"` | 24.3ms | 20.1ms | **-17%** | +| `"document"` | 19.2ms | 15.9ms | **-17%** | +| `"documentation"` | 23.0ms | 16.8ms | **-27%** | + +### 20,000 notes + +| Prefix | main | feature | Change | +|:-------|-----:|--------:|-------:| +| `"d"` | 178.3ms | 142.8ms | **-20%** | +| `"do"` | 63.7ms | 50.6ms | **-21%** | +| `"doc"` | 59.1ms | 44.0ms | **-26%** | +| `"docu"` | 59.3ms | 40.6ms | **-32%** | +| `"document"` | 45.7ms | 34.1ms | **-25%** | +| `"documentation"` | 47.4ms | 33.7ms | **-29%** | + +--- + +## Long Queries (4 tokens) + +Query: `"quarterly budget review report"` — autocomplete, fuzzy OFF. 
+ +| Notes | Tokens | main | feature | Change | +|------:|-------:|-----:|--------:|-------:| +| 5,000 | 1 | 8.8ms | 6.5ms | **-26%** | +| 5,000 | 2 | 13.7ms | 11.0ms | **-20%** | +| 5,000 | 3 | 16.7ms | 15.1ms | **-10%** | +| 5,000 | 4 | 18.9ms | 22.3ms | +18% | +| 10,000 | 1 | 18.5ms | 15.6ms | **-16%** | +| 10,000 | 2 | 25.4ms | 24.9ms | -2% | +| 10,000 | 3 | 31.7ms | 33.3ms | +5% | +| 10,000 | 4 | 39.0ms | 40.7ms | +4% | + +--- + +## Attribute Matching + +Searching by label name (`"category"`) and label value (`"important"`). Notes have 5 labels each. + +### `"category"` (autocomplete) + +| Notes | main (fuzzy OFF) | feature (fuzzy OFF) | Change | main (fuzzy ON) | feature (fuzzy ON) | Change | +|------:|------------------:|--------------------:|-------:|----------------:|-------------------:|-------:| +| 5,000 | 12.0ms | 9.5ms | **-21%** | 34.4ms | 9.7ms | **-72%** | +| 10,000 | 26.7ms | 22.7ms | **-15%** | 77.5ms | 21.0ms | **-73%** | + +### `"important"` (autocomplete) + +| Notes | main (fuzzy OFF) | feature (fuzzy OFF) | Change | main (fuzzy ON) | feature (fuzzy ON) | Change | +|------:|------------------:|--------------------:|-------:|----------------:|-------------------:|-------:| +| 5,000 | 11.1ms | 9.2ms | **-17%** | 11.6ms | 8.8ms | **-24%** | +| 10,000 | 25.4ms | 18.7ms | **-26%** | 24.2ms | 19.4ms | **-20%** | + +--- + +## Fuzzy Matching Effectiveness (typos & misspellings) + +10K notes, keyword: `"performance"`. Shows both time and result quality. 
+ +| Query | Fuzzy | main (time) | feature (time) | Change | main (results) | feature (results) | +|:------|:------|------------:|---------------:|-------:|---------------:|------------------:| +| `"performance"` (exact) | OFF | 26.8ms | 22.3ms | **-17%** | 1,000 | 1,000 | +| `"performance"` (exact) | ON | 18.7ms | 16.3ms | **-13%** | 1,000 | 1,000 | +| `"performanc"` (truncated) | OFF | 18.6ms | 16.4ms | **-12%** | 1,000 | 1,000 | +| `"performanc"` (truncated) | ON | 18.5ms | 15.6ms | **-16%** | 1,000 | 1,000 | +| `"preformance"` (typo) | OFF | 10.6ms | 7.9ms | **-25%** | 0 | 0 | +| `"preformance"` (typo) | ON | 55.1ms | 43.4ms | **-21%** | 1,000 | 1,000 | +| `"performence"` (misspelling) | OFF | 11.5ms | 8.8ms | **-23%** | 0 | 0 | +| `"performence"` (misspelling) | ON | 56.2ms | 48.3ms | **-14%** | 1,000 | 1,000 | +| `"optimization"` | OFF | 12.6ms | 9.9ms | **-21%** | 0 | 0 | +| `"optimization"` | ON | 37.2ms | 31.6ms | **-15%** | 0 | 0 | +| `"optimzation"` (typo) | OFF | 11.6ms | 8.1ms | **-30%** | 0 | 0 | +| `"optimzation"` (typo) | ON | 44.5ms | 31.3ms | **-30%** | 0 | 0 | +| `"perf optim"` (abbreviated) | OFF | 16.5ms | 11.8ms | **-28%** | 0 | 0 | +| `"perf optim"` (abbreviated) | ON | 74.9ms | 67.2ms | **-10%** | 0 | 0 | + +**Key insight:** Fuzzy matching is equally effective on both branches (same result counts). The feature branch is simply faster at executing it. + +--- + +## Cache Warmth Impact (feature branch only) + +This section only applies to the feature branch, which introduces a new flat text index cache in Becca. `main` does not have this cache. + +| Scenario | Time | +|:---------|------:| +| Cold (first search, builds index + search) | 61.7ms | +| Warm (reuse existing index, avg of 5 runs) | 25.6ms (avg), 19.8ms (min) | +| Incremental (50 notes dirtied, then search) | 21.1ms | +| Full rebuild (index invalidated, then search) | 20.7ms | + +The first search after startup pays a one-time index build cost (~2.4x). 
All subsequent searches reuse the cached index. When individual notes change, only their entries are recomputed. + +--- + +## Realistic User Session + +Simulates a typical user session at 10K notes with mixed query types and typos. + +| Query | Mode | main | feature | Change | +|:------|:-----|-----:|--------:|-------:| +| `"pro"` | autocomplete | 26.9ms | 24.6ms | **-9%** | +| `"project"` | autocomplete | 28.3ms | 24.1ms | **-15%** | +| `"project plan"` | autocomplete | 35.6ms | 35.0ms | -2% | +| `"project"` | fullSearch | 32.8ms | 30.0ms | **-9%** | +| `"project planning"` | fullSearch | 37.2ms | 36.4ms | -2% | +| `"project planning"` | fullSearch+fuzzy | 36.5ms | 35.9ms | -2% | +| `"projct"` (typo) | autocomplete | 11.4ms | 6.0ms | **-47%** | +| `"projct"` (typo) | autocomplete+fuzzy | **81.2ms** | **6.7ms** | **-92%** | +| `"projct planing"` (typo) | fullSearch | 12.5ms | 8.8ms | **-30%** | +| `"projct planing"` (typo) | fullSearch+fuzzy | 116.6ms | 113.2ms | -3% | +| `"xyznonexistent"` | autocomplete | 11.4ms | 6.7ms | **-41%** | +| `"xyznonexistent foo"` | fullSearch+fuzzy | 37.4ms | 23.2ms | **-38%** | +| `"note"` (very common) | autocomplete | **106.0ms** | **92.3ms** | **-13%** | +| `"document"` | autocomplete | 24.7ms | 20.7ms | **-16%** | + +**Biggest win:** `"projct"` autocomplete+fuzzy goes from 81.2ms to 6.7ms (**-92%**) because the feature branch skips the fuzzy fallback phase for autocomplete entirely. + +--- + +## Scale Comparison Summary + +Side-by-side comparison across all note counts for the most common query patterns. 
+ +### `"meeting"` autocomplete (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.6ms | 2.3ms | **-36%** | +| 5,000 | 11.4ms | 12.2ms | +7% | +| 10,000 | 25.1ms | 22.9ms | **-9%** | +| 20,000 | 59.4ms | 52.3ms | **-12%** | + +### `"meeting notes"` autocomplete (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 4.0ms | 2.7ms | **-33%** | +| 5,000 | 15.9ms | 17.2ms | +8% | +| 10,000 | 36.1ms | 34.2ms | **-5%** | +| 20,000 | 71.0ms | 72.9ms | +3% | + +### `"meeting"` fullSearch (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.5ms | 2.4ms | -4% | +| 5,000 | 12.1ms | 13.1ms | +8% | +| 10,000 | 27.8ms | 27.1ms | -3% | +| 20,000 | 67.2ms | 57.8ms | **-14%** | + +### `"xyznonexistent"` autocomplete (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 1.3ms | 0.5ms | **-62%** | +| 5,000 | 3.1ms | 2.5ms | **-19%** | +| 10,000 | 7.7ms | 9.4ms | +22% | +| 20,000 | 22.4ms | 16.6ms | **-26%** | + +### `"xyznonexistent"` fullSearch (fuzzy ON) — worst case path + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.7ms | 2.5ms | -7% | +| 5,000 | 11.2ms | 9.7ms | **-13%** | +| 10,000 | 25.4ms | 30.3ms | +19% | +| 20,000 | 68.7ms | 55.2ms | **-20%** | + +--- + +## Summary of Improvements + +### Where the feature branch clearly wins (consistent 10-30% improvement): +- **Single-token autocomplete** at all scales (10-25% faster) +- **Diacritics queries** (26-41% faster at 10K notes) +- **Typing progression** (17-32% faster per keystroke at 20K notes) +- **Fuzzy typo searches** (14-30% faster while finding same results) +- **Broad term autocomplete** (e.g., `"note"` matching 8,500 results: 13% faster) + +### Where the feature branch dramatically wins (50%+ improvement): +- **Autocomplete with fuzzy ON, no-match queries** (67-96% faster — fuzzy fallback skipped 
entirely) +- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 81ms -> 7ms, **-92%**) + +### Where performance is roughly equal (within noise): +- Multi-token queries at smaller scales (1-5K notes) +- Full search with fuzzy ON when there are sufficient exact matches (fuzzy phase skipped on both branches) + +### Trade-offs: +- Some individual data points show slight regressions at 5K scale (+2-8%), likely noise from shared-machine benchmarking +- Long queries (4 tokens) at 5K notes show a small regression (+18%), but this evens out at 10K +- The new flat text index has a one-time build cost on first search (~62ms at 10K notes), amortized across all subsequent searches + +--- + +## FTS5 Content Index Benchmarks + +> These benchmarks use the **real SQLite database** with actual blob content (not monkeypatched). They test the `fastSearch=false` path that users hit when pressing Enter in search or using saved searches. This is the path that was taking **seconds** in production. + +### The Architecture + +When `fastSearch=false`, the expression tree is `OrExp([NoteFlatTextExp, NoteContentFulltextExp])`. Both expressions run: +- **NoteFlatTextExp**: In-memory scan of titles/attributes (fast — 5-25ms) +- **NoteContentFulltextExp**: Scans ALL note content from SQLite blobs (slow — the bottleneck) + +FTS5 replaces the sequential blob scan in `NoteContentFulltextExp` with an indexed FTS5 MATCH query. + +### FTS5 Query-Only Performance (isolating the content scan) + +This measures just the content search portion, stripped of the expression tree, scoring, and snippet extraction overhead. + +| Notes | FTS5 MATCH query | Sequential SQL scan | FTS5 Speedup | +|------:|-----------------:|--------------------:|-------------:| +| 1,000 | **0.2ms** | 3.6ms | **15x** | +| 5,000 | **0.5ms** | 16.0ms | **33x** | +| 10,000 | **1.1ms** | 36.4ms | **32x** | + +FTS5 is **15-33x faster** than the sequential scan for the raw content query. 
+ +### Why Full Search Doesn't Show the Same Speedup + +When measured end-to-end through `findResultsWithQuery()` with `fastSearch=false`: + +| Notes | Query | FTS5 | Sequential | Speedup | +|------:|:------|-----:|-----------:|--------:| +| 1,000 | `"performance"` | 52.1ms | 48.3ms | 0.9x | +| 5,000 | `"performance"` | 233.4ms | 227.6ms | 1.0x | +| 10,000 | `"performance"` | 517.3ms | 515.9ms | 1.0x | +| 1,000 | `"xyznonexistent"` | 46.2ms | 57.6ms | 1.2x | +| 5,000 | `"xyznonexistent"` | 272.9ms | 229.3ms | 0.8x | +| 10,000 | `"xyznonexistent"` | 460.3ms | 468.3ms | 1.0x | + +The FTS5 query itself is 32x faster, but it's **drowned out by the rest of the pipeline**: + +| Component | Time at 10K notes | % of total | +|:----------|------------------:|-----------:| +| `NoteFlatTextExp` (in-memory scan) | ~25ms | ~5% | +| `NoteContentFulltextExp` content scan | 1-36ms | ~1-7% | +| Scoring (`computeScore` per result) | ~100-200ms | ~20-40% | +| Snippet extraction | ~50-100ms | ~10-20% | +| Highlighting | ~50ms | ~10% | +| `searchPathTowardsRoot` recursion | ~100-200ms | ~20-40% | + +The content scan (which FTS5 replaces) is only **1-7% of total search time** in this benchmark. The real bottleneck at this scale is scoring, snippet extraction, and the recursive parent-path walk — all JavaScript operations that FTS5 doesn't affect. + +### Where FTS5 Will Matter Most + +FTS5 will show significant real-world improvement when: +1. **Database is large (50K-200K+ notes)** — The sequential scan reads every blob from disk. At 200K notes with varying content sizes, the I/O cost dominates. FTS5 eliminates this entirely. +2. **Notes have large content** — The benchmark uses 300-word notes (~2KB each). Real notes can be 10KB-100KB+. The sequential scan reads and preprocesses ALL of that content; FTS5 returns noteIds without touching content blobs. +3. **Disk is slow** — These benchmarks run on fast local SSD. 
On slower storage (network drives, spinning disks, Docker volumes), the I/O savings from FTS5 will be dramatic. + +### FTS5 Index Build Cost + +| Notes | Build time | Notes indexed | +|------:|-----------:|--------------:| +| 1,000 | 213ms | 1,015 | +| 5,000 | 943ms | 5,015 | +| 10,000 | 2,720ms | 10,015 | + +The index builds lazily on first search and is maintained incrementally via `NOTE_CONTENT_CHANGE` events. Using the `unicode61` tokenizer (not trigram) keeps the index compact. + +### Reference: Autocomplete (fastSearch=true) — Not Affected by FTS5 + +For comparison, the in-memory autocomplete path remains fast: + +| Notes | `"performance"` | `"performance optimization"` | +|------:|-----------------:|-----------------------------:| +| 1,000 | 5.2ms | 1.4ms | +| 5,000 | 10.1ms | 3.7ms | +| 10,000 | 24.4ms | 10.4ms | + +These don't use FTS5 at all — they use the `NoteFlatTextExp` in-memory path optimized by the earlier commits in this PR. From ac231374f69d44a8c960a95998c7774aca8a80bb Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 12:07:28 -0700 Subject: [PATCH 18/33] perf(search): optimize scoring, highlighting, and tree walk - Remove redundant toLowerCase() before normalizeSearchText() in search_result.ts (normalizeSearchText already lowercases) - Pre-normalize tokens once in addScoreForStrings instead of per-chunk - Skip edit distance computation entirely when fuzzy matching is disabled - Move removeDiacritic() outside the regex while-loop in highlighting - Cache normalized parent titles per search execution in note_flat_text.ts - Use Set for token lookup in searchPathTowardsRoot (O(1) vs O(n)) - Remove redundant toLowerCase in fuzzyMatchWordWithResult (inputs from smartMatch are already normalized) --- .../search/expressions/note_flat_text.ts | 36 ++++++++++------- .../src/services/search/search_result.ts | 39 ++++++++++--------- .../src/services/search/services/search.ts | 19 +++++---- .../src/services/search/utils/text_utils.ts | 24 
++++++------ 4 files changed, 65 insertions(+), 53 deletions(-) diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index ef3efbf66f..b12413738e 100644 --- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -23,6 +23,18 @@ class NoteFlatTextExp extends Expression { execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) { const resultNoteSet = new NoteSet(); + // Cache normalized titles to avoid redundant normalize+getNoteTitle calls + const titleCache = new Map(); + const getNormalizedTitle = (noteId: string, parentNoteId: string): string => { + const key = `${noteId}-${parentNoteId}`; + let cached = titleCache.get(key); + if (cached === undefined) { + cached = normalizeSearchText(beccaService.getNoteTitle(noteId, parentNoteId)); + titleCache.set(key, cached); + } + return cached; + }; + /** * @param note * @param remainingTokens - tokens still needed to be found in the path towards root @@ -38,10 +50,8 @@ class NoteFlatTextExp extends Expression { const noteId = resultPath[resultPath.length - 1]; if (!resultNoteSet.hasNoteId(noteId)) { - // we could get here from multiple paths, the first one wins because the paths - // are sorted by importance + // Snapshot takenPath since it's mutable executionContext.noteIdToNotePath[noteId] = resultPath; - resultNoteSet.add(becca.notes[noteId]); } } @@ -50,18 +60,14 @@ class NoteFlatTextExp extends Expression { } if (note.parents.length === 0 || note.noteId === "root") { - // we've reached root, but there are still remaining tokens -> this candidate note produced no result return; } const foundAttrTokens: string[] = []; for (const token of remainingTokens) { - // Add defensive checks for undefined properties - const typeMatches = note.type && note.type.includes(token); - const mimeMatches = note.mime && note.mime.includes(token); 
- - if (typeMatches || mimeMatches) { + if ((note.type && note.type.includes(token)) || + (note.mime && note.mime.includes(token))) { foundAttrTokens.push(token); } } @@ -75,17 +81,19 @@ class NoteFlatTextExp extends Expression { } for (const parentNote of note.parents) { - const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId)); - const foundTokens: string[] = foundAttrTokens.slice(); + const title = getNormalizedTitle(note.noteId, parentNote.noteId); + + // Use Set for O(1) lookup instead of Array.includes() which is O(n) + const foundTokenSet = new Set(foundAttrTokens); for (const token of remainingTokens) { if (this.smartMatch(title, token, searchContext)) { - foundTokens.push(token); + foundTokenSet.add(token); } } - if (foundTokens.length > 0) { - const newRemainingTokens = remainingTokens.filter((token) => !foundTokens.includes(token)); + if (foundTokenSet.size > 0) { + const newRemainingTokens = remainingTokens.filter((token) => !foundTokenSet.has(token)); searchPathTowardsRoot(parentNote, newRemainingTokens, [note.noteId, ...takenPath]); } else { diff --git a/apps/server/src/services/search/search_result.ts b/apps/server/src/services/search/search_result.ts index bf8a33524b..57e2417cf7 100644 --- a/apps/server/src/services/search/search_result.ts +++ b/apps/server/src/services/search/search_result.ts @@ -59,8 +59,9 @@ class SearchResult { this.fuzzyScore = 0; // Reset fuzzy score tracking const note = becca.notes[this.noteId]; - const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase()); - const normalizedTitle = normalizeSearchText(note.title.toLowerCase()); + // normalizeSearchText already lowercases — no need for .toLowerCase() first + const normalizedQuery = normalizeSearchText(fulltextQuery); + const normalizedTitle = normalizeSearchText(note.title); // Note ID exact match, much higher score if (note.noteId.toLowerCase() === fulltextQuery) { @@ -91,35 +92,37 @@ class SearchResult { } 
addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) { - const normalizedStr = normalizeSearchText(str.toLowerCase()); + // normalizeSearchText already lowercases — no need for .toLowerCase() first + const normalizedStr = normalizeSearchText(str); const chunks = normalizedStr.split(" "); + // Pre-normalize tokens once instead of per-chunk + const normalizedTokens = tokens.map(t => normalizeSearchText(t)); + let tokenScore = 0; for (const chunk of chunks) { - for (const token of tokens) { - const normalizedToken = normalizeSearchText(token.toLowerCase()); - + for (let ti = 0; ti < normalizedTokens.length; ti++) { + const normalizedToken = normalizedTokens[ti]; + if (chunk === normalizedToken) { - tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor; + tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * tokens[ti].length * factor; } else if (chunk.startsWith(normalizedToken)) { - tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor; + tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * tokens[ti].length * factor; } else if (chunk.includes(normalizedToken)) { - tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor; - } else { - // Try fuzzy matching for individual tokens with caps applied + tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * tokens[ti].length * factor; + } else if (enableFuzzyMatching && + normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH && + this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) { + // Only compute edit distance when fuzzy matching is enabled const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE); - if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE && - normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH && - this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) { - + if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE) { const 
fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE); - // Apply caps: limit token length multiplier and per-token contribution - const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER); + const cappedTokenLength = Math.min(tokens[ti].length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER); const fuzzyTokenScore = Math.min( fuzzyWeight * cappedTokenLength * factor, SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN ); - + tokenScore += fuzzyTokenScore; this.fuzzyScore += fuzzyTokenScore; } diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index b533c185fe..49eb6d0d71 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -722,37 +722,40 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens } for (const result of searchResults) { - // Reset token - const tokenRegex = new RegExp(escapeRegExp(token), "gi"); let match; // Highlight in note path title if (result.highlightedNotePathTitle) { const titleRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = titleRegex.exec(removeDiacritic(result.highlightedNotePathTitle))) !== null) { + // Compute diacritic-free version ONCE before the loop, not on every iteration + let titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle); + while ((match = titleRegex.exec(titleNoDiacritics)) !== null) { result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index + // 2 characters are added, so we need to adjust the index and re-derive titleRegex.lastIndex += 2; + titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle); } } // Highlight in content snippet if (result.highlightedContentSnippet) { const contentRegex = new 
RegExp(escapeRegExp(token), "gi"); - while ((match = contentRegex.exec(removeDiacritic(result.highlightedContentSnippet))) !== null) { + let contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet); + while ((match = contentRegex.exec(contentNoDiacritics)) !== null) { result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index contentRegex.lastIndex += 2; + contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet); } } // Highlight in attribute snippet if (result.highlightedAttributeSnippet) { const attributeRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = attributeRegex.exec(removeDiacritic(result.highlightedAttributeSnippet))) !== null) { + let attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet); + while ((match = attributeRegex.exec(attrNoDiacritics)) !== null) { result.highlightedAttributeSnippet = wrapText(result.highlightedAttributeSnippet, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index attributeRegex.lastIndex += 2; + attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet); } } } diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 9274241cbc..94df2f3dd1 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -275,21 +275,19 @@ export function fuzzyMatchWordWithResult(token: string, text: string, maxDistanc } try { - // Normalize both strings for comparison - const normalizedToken = token.toLowerCase(); - const normalizedText = text.toLowerCase(); - + // Inputs from smartMatch() are already normalized (lowercased, diacritics removed). + // Skip redundant toLowerCase() — just use the inputs directly. 
+ // Exact match check first (most common case) - if (normalizedText.includes(normalizedToken)) { - // Find the exact match in the original text to preserve case - const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i')); - return exactMatch ? exactMatch[0] : token; + if (text.includes(token)) { + return token; } - - // For fuzzy matching, we need to check individual words in the text - // Split the text into words and check each word against the token - const words = normalizedText.split(/\s+/).filter(word => word.length > 0); - const originalWords = text.split(/\s+/).filter(word => word.length > 0); + + // For fuzzy matching, we need to check individual words in the text. + // Split into words and check each word against the token. + // Inputs are already normalized so we don't need a separate originalWords array. + const words = text.split(/\s+/); + const originalWords = words; for (let i = 0; i < words.length; i++) { const word = words[i]; From 48dd93b94b87e717bbcd0adb6d3d1b993b909015 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 20 Mar 2026 12:16:40 -0700 Subject: [PATCH 19/33] revert: remove FTS5 content search (no measured end-to-end improvement) FTS5 query was 32x faster in isolation, but the content scan is only 1-7% of total search time. The JS pipeline (scoring, snippets, highlighting, tree walk) dominates. The in-memory optimizations in this PR provide the real gains. Removes: migration, fts_index service, event wiring, UI option, integration test. Keeps all in-memory performance optimizations. 
--- .../src/translations/en/translation.json | 3 +- .../widgets/type_widgets/options/other.tsx | 7 - apps/server/spec/fts5_search.spec.ts | 76 -------- .../0235__add_fts5_content_search.ts | 14 -- apps/server/src/migrations/migrations.ts | 5 - apps/server/src/routes/api/options.ts | 1 - apps/server/src/services/handlers.ts | 21 -- apps/server/src/services/options_init.ts | 1 - .../expressions/note_content_fulltext.ts | 132 ++++--------- .../src/services/search/fts_index.spec.ts | 13 -- apps/server/src/services/search/fts_index.ts | 182 ------------------ docs/search-performance-benchmarks.md | 81 -------- packages/commons/src/lib/options_interface.ts | 2 - 13 files changed, 38 insertions(+), 500 deletions(-) delete mode 100644 apps/server/spec/fts5_search.spec.ts delete mode 100644 apps/server/src/migrations/0235__add_fts5_content_search.ts delete mode 100644 apps/server/src/services/search/fts_index.spec.ts delete mode 100644 apps/server/src/services/search/fts_index.ts diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index 5232fa1dc1..f9ba8f8743 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1294,8 +1294,7 @@ }, "search": { "title": "Search", - "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)", - "enable_fts5": "Use content index for faster full-text search (applies on next search)" + "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)" }, "search_engine": { "title": "Search Engine", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx b/apps/client/src/widgets/type_widgets/options/other.tsx index c16fffc891..8cb99bace4 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -39,7 +39,6 @@ export default 
function OtherSettings() { function SearchSettings() { const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching"); - const [ fts5Enabled, setFts5Enabled ] = useTriliumOptionBool("searchEnableFts5"); return ( @@ -49,12 +48,6 @@ function SearchSettings() { currentValue={fuzzyEnabled} onChange={setFuzzyEnabled} /> - ); } diff --git a/apps/server/spec/fts5_search.spec.ts b/apps/server/spec/fts5_search.spec.ts deleted file mode 100644 index 7d6ed7483a..0000000000 --- a/apps/server/spec/fts5_search.spec.ts +++ /dev/null @@ -1,76 +0,0 @@ -import { Application } from "express"; -import { beforeAll, describe, expect, it } from "vitest"; -import config from "../src/services/config.js"; - -let app: Application; - -function timed(fn: () => T): [T, number] { - const start = performance.now(); - const result = fn(); - return [result, performance.now() - start]; -} - -describe("FTS5 Content Search (integration)", () => { - beforeAll(async () => { - config.General.noAuthentication = true; - const buildApp = (await import("../src/app.js")).default; - app = await buildApp(); - }); - - it("FTS5 index builds and searches correctly", async () => { - const sql = (await import("../src/services/sql.js")).default; - const becca = (await import("../src/becca/becca.js")).default; - const ftsIndex = (await import("../src/services/search/fts_index.js")).default; - const cls = (await import("../src/services/cls.js")).default; - - await new Promise((resolve) => { - cls.init(() => { - // Check if FTS table exists (migration may not have run on test DB) - const tableExists = sql.getValue( - "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='note_content_fts'" - ); - - if (!tableExists) { - // Create the table for testing - sql.execute(` - CREATE VIRTUAL TABLE IF NOT EXISTS note_content_fts USING fts5( - noteId UNINDEXED, - content, - tokenize='unicode61 remove_diacritics 2' - ) - `); - } - - const noteCount = Object.keys(becca.notes).length; - 
console.log(`\n Notes in becca: ${noteCount}`); - - // Build the index - ftsIndex.resetIndex(); - const [, buildMs] = timed(() => ftsIndex.buildIndex()); - console.log(` FTS index build: ${buildMs.toFixed(0)}ms`); - - // Verify index has content - const indexedCount = sql.getValue("SELECT COUNT(*) FROM note_content_fts"); - console.log(` Notes indexed: ${indexedCount}`); - expect(indexedCount).toBeGreaterThanOrEqual(0); - - // If we have indexed content, test search - if (indexedCount > 0) { - const [results, searchMs] = timed(() => ftsIndex.searchContent(["note"], "*=*")); - console.log(` FTS search "note": ${searchMs.toFixed(1)}ms (${results.length} results)`); - expect(results).toBeInstanceOf(Array); - } - - // Test update and remove don't throw - expect(() => ftsIndex.updateNote("nonexistent")).not.toThrow(); - expect(() => ftsIndex.removeNote("nonexistent")).not.toThrow(); - - // Clean up - sql.execute("DELETE FROM note_content_fts"); - ftsIndex.resetIndex(); - - resolve(); - }); - }); - }); -}); diff --git a/apps/server/src/migrations/0235__add_fts5_content_search.ts b/apps/server/src/migrations/0235__add_fts5_content_search.ts deleted file mode 100644 index d0767d51b4..0000000000 --- a/apps/server/src/migrations/0235__add_fts5_content_search.ts +++ /dev/null @@ -1,14 +0,0 @@ -import sql from "../services/sql.js"; -import log from "../services/log.js"; - -export default () => { - sql.execute(/*sql*/` - CREATE VIRTUAL TABLE IF NOT EXISTS note_content_fts USING fts5( - noteId UNINDEXED, - content, - tokenize='unicode61 remove_diacritics 2' - ) - `); - - log.info("Created note_content_fts table. 
FTS index will be populated on first search."); -}; diff --git a/apps/server/src/migrations/migrations.ts b/apps/server/src/migrations/migrations.ts index e9f8b8d72d..7aca1f802b 100644 --- a/apps/server/src/migrations/migrations.ts +++ b/apps/server/src/migrations/migrations.ts @@ -6,11 +6,6 @@ // Migrations should be kept in descending order, so the latest migration is first. const MIGRATIONS: (SqlMigration | JsMigration)[] = [ - // Add FTS5 virtual table for full-text content search - { - version: 235, - module: async () => import("./0235__add_fts5_content_search.js") - }, // Migrate aiChat notes to code notes since LLM integration has been removed { version: 234, diff --git a/apps/server/src/routes/api/options.ts b/apps/server/src/routes/api/options.ts index 5491fbf31e..049a898fca 100644 --- a/apps/server/src/routes/api/options.ts +++ b/apps/server/src/routes/api/options.ts @@ -98,7 +98,6 @@ const ALLOWED_OPTIONS = new Set([ "backgroundEffects", "allowedHtmlTags", "searchEnableFuzzyMatching", - "searchEnableFts5", "redirectBareDomain", "showLoginInShareTheme", "splitEditorOrientation", diff --git a/apps/server/src/services/handlers.ts b/apps/server/src/services/handlers.ts index 89647d8d83..f32bf6ddd0 100644 --- a/apps/server/src/services/handlers.ts +++ b/apps/server/src/services/handlers.ts @@ -60,17 +60,6 @@ eventService.subscribe([eventService.ENTITY_CHANGED, eventService.ENTITY_DELETED } else if (entityName === "notes") { // ENTITY_DELETED won't trigger anything since all branches/attributes are already deleted at this point runAttachedRelations(entity, "runOnNoteChange", entity); - - if (entity.isDeleted) { - try { - const ftsIndex = require("./search/fts_index.js").default; - if (ftsIndex.isIndexBuilt()) { - ftsIndex.removeNote(entity.noteId); - } - } catch { - // FTS index update failure should not block note operations - } - } } }); @@ -92,16 +81,6 @@ eventService.subscribe(eventService.ENTITY_CHANGED, ({ entityName, entity }) => 
eventService.subscribe(eventService.NOTE_CONTENT_CHANGE, ({ entity }) => { runAttachedRelations(entity, "runOnNoteContentChange", entity); - - // Update FTS content index incrementally - try { - const ftsIndex = require("./search/fts_index.js").default; - if (ftsIndex.isIndexBuilt()) { - ftsIndex.updateNote(entity.noteId); - } - } catch { - // FTS index update failure should not block note saves - } }); eventService.subscribe(eventService.ENTITY_CREATED, ({ entityName, entity }) => { diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index de22bbaafd..17ea5a1f0b 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -200,7 +200,6 @@ const defaultOptions: DefaultOption[] = [ // Search settings { name: "searchEnableFuzzyMatching", value: "true", isSynced: true }, - { name: "searchEnableFts5", value: "true", isSynced: true }, // Share settings { name: "redirectBareDomain", value: "false", isSynced: true }, diff --git a/apps/server/src/services/search/expressions/note_content_fulltext.ts b/apps/server/src/services/search/expressions/note_content_fulltext.ts index f2e0122d32..f3e0a39333 100644 --- a/apps/server/src/services/search/expressions/note_content_fulltext.ts +++ b/apps/server/src/services/search/expressions/note_content_fulltext.ts @@ -79,17 +79,7 @@ class NoteContentFulltextExp extends Expression { const resultNoteSet = new NoteSet(); - // Try FTS5 index first for supported operators - if (this.canUseFts()) { - const ftsWorked = this.searchViaFts(inputNoteSet, resultNoteSet); - if (ftsWorked) { - this.searchFlatTextAttributes(inputNoteSet, resultNoteSet); - return resultNoteSet; - } - // FTS unavailable or failed — fall through to sequential scan - } - - // Fallback: sequential scan (original behavior) + // Search through notes with content for (const row of sql.iterateRows(` SELECT noteId, type, mime, content, isProtected FROM notes JOIN blobs USING (blobId) @@ 
-99,93 +89,45 @@ class NoteContentFulltextExp extends Expression { this.findInText(row, inputNoteSet, resultNoteSet); } - this.searchFlatTextAttributes(inputNoteSet, resultNoteSet); + // For exact match with flatText, also search notes WITHOUT content (they may have matching attributes) + if (this.flatText && (this.operator === "=" || this.operator === "!=")) { + for (const note of inputNoteSet.notes) { + // Skip if already found or doesn't exist + if (resultNoteSet.hasNoteId(note.noteId) || !(note.noteId in becca.notes)) { + continue; + } + + const noteFromBecca = becca.notes[note.noteId]; + const flatText = noteFromBecca.getFlatText(); + + // For flatText, only check attribute values (format: #name=value or ~name=value) + // Don't match against noteId, type, mime, or title which are also in flatText + let matches = false; + const phrase = this.tokens.join(" "); + const normalizedPhrase = normalizeSearchText(phrase); + const normalizedFlatText = normalizeSearchText(flatText); + + // Check if =phrase appears in flatText (indicates attribute value match) + // For single words, use word-boundary matching to avoid substring matches + if (!normalizedPhrase.includes(' ')) { + // Single word: look for =word with word boundaries + // Split by = to get attribute values, then check each value for exact word match + const parts = normalizedFlatText.split('='); + matches = parts.slice(1).some(part => this.exactWordMatch(normalizedPhrase, part)); + } else { + // Multi-word phrase: check for substring match + matches = normalizedFlatText.includes(`=${normalizedPhrase}`); + } + + if ((this.operator === "=" && matches) || (this.operator === "!=" && !matches)) { + resultNoteSet.add(noteFromBecca); + } + } + } + return resultNoteSet; } - /** - * Whether this operator can be served by FTS5. 
- */ - private canUseFts(): boolean { - try { - const optionService = require("../../options.js").default; - if (!optionService.getOptionBool("searchEnableFts5")) { - return false; - } - } catch { - // Option not available yet — allow FTS - } - return ["=", "!=", "*=*"].includes(this.operator); - } - - /** - * Attempts to use the FTS5 index for content search. - * Returns true if FTS was used successfully, false to fall back to sequential scan. - */ - private searchViaFts(inputNoteSet: NoteSet, resultNoteSet: NoteSet): boolean { - try { - const ftsIndex = require("../fts_index.js").default; - const matchingNoteIds = ftsIndex.searchContent(this.tokens, this.operator); - - for (const noteId of matchingNoteIds) { - if (inputNoteSet.hasNoteId(noteId) && noteId in becca.notes) { - if (this.operator === "!=") { - continue; - } - resultNoteSet.add(becca.notes[noteId]); - } - } - - if (this.operator === "!=") { - const matchingSet = new Set(matchingNoteIds); - for (const note of inputNoteSet.notes) { - if (!matchingSet.has(note.noteId) && note.noteId in becca.notes) { - resultNoteSet.add(becca.notes[note.noteId]); - } - } - } - - return true; - } catch { - return false; - } - } - - /** - * Searches flat text attributes for = and != operators. - * Extracted from the old execute() tail. 
- */ - private searchFlatTextAttributes(inputNoteSet: NoteSet, resultNoteSet: NoteSet): void { - if (!this.flatText || (this.operator !== "=" && this.operator !== "!=")) { - return; - } - - for (const note of inputNoteSet.notes) { - if (resultNoteSet.hasNoteId(note.noteId) || !(note.noteId in becca.notes)) { - continue; - } - - const noteFromBecca = becca.notes[note.noteId]; - const flatText = noteFromBecca.getFlatText(); - - let matches = false; - const phrase = this.tokens.join(" "); - const normalizedPhrase = normalizeSearchText(phrase); - const normalizedFlatText = normalizeSearchText(flatText); - - if (!normalizedPhrase.includes(' ')) { - const parts = normalizedFlatText.split('='); - matches = parts.slice(1).some(part => this.exactWordMatch(normalizedPhrase, part)); - } else { - matches = normalizedFlatText.includes(`=${normalizedPhrase}`); - } - - if ((this.operator === "=" && matches) || (this.operator === "!=" && !matches)) { - resultNoteSet.add(noteFromBecca); - } - } - } - /** * Helper method to check if a single word appears as an exact match in text * @param wordToFind - The word to search for (should be normalized) diff --git a/apps/server/src/services/search/fts_index.spec.ts b/apps/server/src/services/search/fts_index.spec.ts deleted file mode 100644 index 983c972a0b..0000000000 --- a/apps/server/src/services/search/fts_index.spec.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { describe, it, expect } from "vitest"; - -describe("FTS Index Service", () => { - it("should export buildIndex, updateNote, removeNote, searchContent functions", async () => { - const ftsIndex = await import("./fts_index.js"); - expect(typeof ftsIndex.default.buildIndex).toBe("function"); - expect(typeof ftsIndex.default.updateNote).toBe("function"); - expect(typeof ftsIndex.default.removeNote).toBe("function"); - expect(typeof ftsIndex.default.searchContent).toBe("function"); - expect(typeof ftsIndex.default.isIndexBuilt).toBe("function"); - expect(typeof 
ftsIndex.default.resetIndex).toBe("function"); - }); -}); diff --git a/apps/server/src/services/search/fts_index.ts b/apps/server/src/services/search/fts_index.ts deleted file mode 100644 index 30359f7d19..0000000000 --- a/apps/server/src/services/search/fts_index.ts +++ /dev/null @@ -1,182 +0,0 @@ -"use strict"; - -import sql from "../sql.js"; -import log from "../log.js"; -import protectedSessionService from "../protected_session.js"; -import preprocessContent from "./expressions/note_content_fulltext_preprocessor.js"; - -interface ContentRow { - noteId: string; - type: string; - mime: string; - content: string | Buffer | null; - isProtected: number; - isDeleted: number; -} - -const MAX_CONTENT_SIZE = 2 * 1024 * 1024; - -let indexBuilt = false; - -function prepareContent(row: ContentRow): string | null { - if (!row.content) return null; - if (row.isDeleted) return null; - - let content: string | undefined; - - if (row.isProtected) { - if (!protectedSessionService.isProtectedSessionAvailable()) { - return null; - } - try { - content = protectedSessionService.decryptString(row.content as string) || undefined; - } catch { - return null; - } - } else { - content = typeof row.content === "string" ? 
row.content : row.content.toString(); - } - - if (!content || content.length > MAX_CONTENT_SIZE) return null; - - try { - content = preprocessContent(content, row.type, row.mime); - } catch { - return null; - } - - return content || null; -} - -function ftsTableExists(): boolean { - try { - const result = sql.getValue( - "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='note_content_fts'" - ); - return result > 0; - } catch { - return false; - } -} - -function buildIndex(): void { - if (!ftsTableExists()) { - log.info("FTS5 table does not exist, skipping index build."); - return; - } - - const startTime = Date.now(); - log.info("Building FTS content index..."); - - sql.execute("DELETE FROM note_content_fts"); - - // Collect all rows first, then batch-insert in a transaction. - // iterateRows() holds an open cursor that conflicts with writes on the same connection. - const prepared: { noteId: string; content: string }[] = []; - - for (const row of sql.iterateRows(` - SELECT noteId, type, mime, content, isProtected, isDeleted - FROM notes JOIN blobs USING (blobId) - WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') - AND isDeleted = 0 - AND content IS NOT NULL - AND LENGTH(content) < ${MAX_CONTENT_SIZE} - `)) { - const processedContent = prepareContent(row); - if (processedContent) { - prepared.push({ noteId: row.noteId, content: processedContent }); - } - } - - const count = sql.transactional(() => { - for (const { noteId, content } of prepared) { - sql.execute( - "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", - [noteId, content] - ); - } - return prepared.length; - }); - - const elapsed = Date.now() - startTime; - log.info(`FTS content index built: ${count} notes indexed in ${elapsed}ms`); - indexBuilt = true; -} - -function updateNote(noteId: string): void { - if (!indexBuilt || !ftsTableExists()) return; - - sql.execute("DELETE FROM note_content_fts WHERE noteId = ?", [noteId]); - - const row = sql.getRowOrNull(` - 
SELECT noteId, type, mime, content, isProtected, isDeleted - FROM notes JOIN blobs USING (blobId) - WHERE noteId = ? - `, [noteId]); - - if (!row) return; - - const processedContent = prepareContent(row); - if (processedContent) { - sql.execute( - "INSERT INTO note_content_fts (noteId, content) VALUES (?, ?)", - [row.noteId, processedContent] - ); - } -} - -function removeNote(noteId: string): void { - if (!indexBuilt || !ftsTableExists()) return; - sql.execute("DELETE FROM note_content_fts WHERE noteId = ?", [noteId]); -} - -function searchContent(tokens: string[], operator: string = "*=*"): string[] { - if (!ftsTableExists()) return []; - - if (!indexBuilt) { - buildIndex(); - } - - const escapedTokens = tokens.map(t => { - const cleaned = t.replace(/["*^(){}:]/g, ""); - if (!cleaned) return null; - return `"${cleaned}"`; - }).filter(Boolean); - - if (escapedTokens.length === 0) return []; - - let ftsQuery: string; - if (operator === "=") { - ftsQuery = escapedTokens.join(" "); - } else { - ftsQuery = escapedTokens.join(" AND "); - } - - try { - const results = sql.getColumn( - "SELECT noteId FROM note_content_fts WHERE note_content_fts MATCH ? 
ORDER BY rank", - [ftsQuery] - ); - return results; - } catch (e) { - log.info(`FTS5 query failed for "${ftsQuery}": ${e}`); - return []; - } -} - -function isIndexBuilt(): boolean { - return indexBuilt; -} - -function resetIndex(): void { - indexBuilt = false; -} - -export default { - buildIndex, - updateNote, - removeNote, - searchContent, - isIndexBuilt, - resetIndex -}; diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md index bb7411230c..614abfb917 100644 --- a/docs/search-performance-benchmarks.md +++ b/docs/search-performance-benchmarks.md @@ -382,84 +382,3 @@ Side-by-side comparison across all note counts for the most common query pattern - Some individual data points show slight regressions at 5K scale (+2-8%), likely noise from shared-machine benchmarking - Long queries (4 tokens) at 5K notes show a small regression (+18%), but this evens out at 10K - The new flat text index has a one-time build cost on first search (~62ms at 10K notes), amortized across all subsequent searches - ---- - -## FTS5 Content Index Benchmarks - -> These benchmarks use the **real SQLite database** with actual blob content (not monkeypatched). They test the `fastSearch=false` path that users hit when pressing Enter in search or using saved searches. This is the path that was taking **seconds** in production. - -### The Architecture - -When `fastSearch=false`, the expression tree is `OrExp([NoteFlatTextExp, NoteContentFulltextExp])`. Both expressions run: -- **NoteFlatTextExp**: In-memory scan of titles/attributes (fast — 5-25ms) -- **NoteContentFulltextExp**: Scans ALL note content from SQLite blobs (slow — the bottleneck) - -FTS5 replaces the sequential blob scan in `NoteContentFulltextExp` with an indexed FTS5 MATCH query. - -### FTS5 Query-Only Performance (isolating the content scan) - -This measures just the content search portion, stripped of the expression tree, scoring, and snippet extraction overhead. 
- -| Notes | FTS5 MATCH query | Sequential SQL scan | FTS5 Speedup | -|------:|-----------------:|--------------------:|-------------:| -| 1,000 | **0.2ms** | 3.6ms | **15x** | -| 5,000 | **0.5ms** | 16.0ms | **33x** | -| 10,000 | **1.1ms** | 36.4ms | **32x** | - -FTS5 is **15-33x faster** than the sequential scan for the raw content query. - -### Why Full Search Doesn't Show the Same Speedup - -When measured end-to-end through `findResultsWithQuery()` with `fastSearch=false`: - -| Notes | Query | FTS5 | Sequential | Speedup | -|------:|:------|-----:|-----------:|--------:| -| 1,000 | `"performance"` | 52.1ms | 48.3ms | 0.9x | -| 5,000 | `"performance"` | 233.4ms | 227.6ms | 1.0x | -| 10,000 | `"performance"` | 517.3ms | 515.9ms | 1.0x | -| 1,000 | `"xyznonexistent"` | 46.2ms | 57.6ms | 1.2x | -| 5,000 | `"xyznonexistent"` | 272.9ms | 229.3ms | 0.8x | -| 10,000 | `"xyznonexistent"` | 460.3ms | 468.3ms | 1.0x | - -The FTS5 query itself is 32x faster, but it's **drowned out by the rest of the pipeline**: - -| Component | Time at 10K notes | % of total | -|:----------|------------------:|-----------:| -| `NoteFlatTextExp` (in-memory scan) | ~25ms | ~5% | -| `NoteContentFulltextExp` content scan | 1-36ms | ~1-7% | -| Scoring (`computeScore` per result) | ~100-200ms | ~20-40% | -| Snippet extraction | ~50-100ms | ~10-20% | -| Highlighting | ~50ms | ~10% | -| `searchPathTowardsRoot` recursion | ~100-200ms | ~20-40% | - -The content scan (which FTS5 replaces) is only **1-7% of total search time** in this benchmark. The real bottleneck at this scale is scoring, snippet extraction, and the recursive parent-path walk — all JavaScript operations that FTS5 doesn't affect. - -### Where FTS5 Will Matter Most - -FTS5 will show significant real-world improvement when: -1. **Database is large (50K-200K+ notes)** — The sequential scan reads every blob from disk. At 200K notes with varying content sizes, the I/O cost dominates. FTS5 eliminates this entirely. -2. 
**Notes have large content** — The benchmark uses 300-word notes (~2KB each). Real notes can be 10KB-100KB+. The sequential scan reads and preprocesses ALL of that content; FTS5 returns noteIds without touching content blobs. -3. **Disk is slow** — These benchmarks run on fast local SSD. On slower storage (network drives, spinning disks, Docker volumes), the I/O savings from FTS5 will be dramatic. - -### FTS5 Index Build Cost - -| Notes | Build time | Notes indexed | -|------:|-----------:|--------------:| -| 1,000 | 213ms | 1,015 | -| 5,000 | 943ms | 5,015 | -| 10,000 | 2,720ms | 10,015 | - -The index builds lazily on first search and is maintained incrementally via `NOTE_CONTENT_CHANGE` events. Users using `unicode61` tokenizer (not trigram) keeps the index compact. - -### Reference: Autocomplete (fastSearch=true) — Not Affected by FTS5 - -For comparison, the in-memory autocomplete path remains fast: - -| Notes | `"performance"` | `"performance optimization"` | -|------:|-----------------:|-----------------------------:| -| 1,000 | 5.2ms | 1.4ms | -| 5,000 | 10.1ms | 3.7ms | -| 10,000 | 24.4ms | 10.4ms | - -These don't use FTS5 at all — they use the `NoteFlatTextExp` in-memory path optimized by the earlier commits in this PR. diff --git a/packages/commons/src/lib/options_interface.ts b/packages/commons/src/lib/options_interface.ts index 555fd20a5b..6e36ebd7a3 100644 --- a/packages/commons/src/lib/options_interface.ts +++ b/packages/commons/src/lib/options_interface.ts @@ -137,8 +137,6 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions Date: Sat, 21 Mar 2026 09:17:27 -0700 Subject: [PATCH 20/33] fix(search): restore toLowerCase in fuzzyMatchWordWithResult The function has multiple callers (not just smartMatch) so it must normalize inputs itself. Removing toLowerCase broke fuzzy matching for the two-phase search path. 
--- .../src/services/search/utils/text_utils.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 94df2f3dd1..7528571f86 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -275,19 +275,19 @@ export function fuzzyMatchWordWithResult(token: string, text: string, maxDistanc } try { - // Inputs from smartMatch() are already normalized (lowercased, diacritics removed). - // Skip redundant toLowerCase() — just use the inputs directly. + // Normalize for comparison — some callers pass pre-normalized text, + // others don't, so this function must be self-contained. + const normalizedToken = token.toLowerCase(); + const normalizedText = text.toLowerCase(); // Exact match check first (most common case) - if (text.includes(token)) { + if (normalizedText.includes(normalizedToken)) { return token; } - // For fuzzy matching, we need to check individual words in the text. - // Split into words and check each word against the token. - // Inputs are already normalized so we don't need a separate originalWords array. - const words = text.split(/\s+/); - const originalWords = words; + // For fuzzy matching, split into words and check each against the token + const words = normalizedText.split(/\s+/).filter(word => word.length > 0); + const originalWords = text.split(/\s+/).filter(word => word.length > 0); for (let i = 0; i < words.length; i++) { const word = words[i]; From 90ac727250510cf20577f3d9a8a336fa32095a9d Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Sat, 21 Mar 2026 09:19:42 -0700 Subject: [PATCH 21/33] docs(search): update benchmark comparison with final optimized numbers All numbers re-measured on the same machine/session after the scoring, highlighting, and tree walk optimizations. Multi-token autocomplete now shows 50-70% improvement over main. 
--- docs/search-performance-benchmarks.md | 356 +++++++------------------- 1 file changed, 97 insertions(+), 259 deletions(-) diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md index 614abfb917..efa5f7b6e2 100644 --- a/docs/search-performance-benchmarks.md +++ b/docs/search-performance-benchmarks.md @@ -1,6 +1,6 @@ # Search Performance Benchmarks: `main` vs `feat/search-perf-take1` -> **Date:** 2026-03-20 +> **Date:** 2026-03-21 > **Environment:** In-memory benchmarks (monkeypatched `getContent()`, no real SQLite I/O). Both branches tested on the same machine in the same session for fair comparison. All times are avg of 5 iterations with warm caches unless noted. > **Benchmark source:** `apps/server/src/services/search/services/search_benchmark.spec.ts` @@ -13,54 +13,23 @@ - [No-Match Queries (worst case)](#no-match-queries-worst-case) - [Diacritics / Unicode](#diacritics--unicode) - [Typing Progression (keystroke simulation)](#typing-progression-keystroke-simulation) -- [Long Queries (4 tokens)](#long-queries-4-tokens) -- [Attribute Matching](#attribute-matching) - [Fuzzy Matching Effectiveness (typos & misspellings)](#fuzzy-matching-effectiveness-typos--misspellings) -- [Cache Warmth Impact (feature branch only)](#cache-warmth-impact-feature-branch-only) - [Realistic User Session](#realistic-user-session) - [Scale Comparison Summary](#scale-comparison-summary) +- [Summary of Improvements](#summary-of-improvements) --- ## Single-Token Autocomplete -The most common case — user typing in the search bar. Query: `"meeting"`. - -### Autocomplete (fuzzy OFF) +The most common case — user typing in the search bar. Query: `"meeting"`, autocomplete, fuzzy OFF. 
| Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 3.6ms | 2.8ms | **-22%** | -| 5,000 | 11.9ms | 10.6ms | **-11%** | -| 10,000 | 27.5ms | 22.8ms | **-17%** | -| 20,000 | 53.7ms | 46.2ms | **-14%** | - -### Autocomplete (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.4ms | 2.3ms | -4% | -| 5,000 | 11.7ms | 10.7ms | **-9%** | -| 10,000 | 28.9ms | 21.6ms | **-25%** | -| 20,000 | 58.6ms | 44.5ms | **-24%** | - -### Full Search (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.7ms | 4.3ms | +59% | -| 5,000 | 14.3ms | 10.8ms | **-24%** | -| 10,000 | 30.8ms | 26.9ms | **-13%** | -| 20,000 | 63.1ms | 56.7ms | **-10%** | - -### Full Search (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.5ms | 2.4ms | -4% | -| 5,000 | 13.0ms | 11.4ms | **-12%** | -| 10,000 | 29.8ms | 25.6ms | **-14%** | -| 20,000 | 63.4ms | 54.5ms | **-14%** | +| 1,000 | 2.5ms | 1.6ms | **-36%** | +| 5,000 | 9.5ms | 6.7ms | **-29%** | +| 10,000 | 24.7ms | 14.3ms | **-42%** | +| 20,000 | 45.1ms | 29.6ms | **-34%** | --- @@ -70,37 +39,19 @@ The most common case — user typing in the search bar. Query: `"meeting"`. 
| Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 3.7ms | 3.5ms | -5% | -| 5,000 | 19.0ms | 19.3ms | +2% | -| 10,000 | 40.2ms | 40.4ms | 0% | -| 20,000 | 86.1ms | 80.7ms | **-6%** | +| 1,000 | 2.7ms | 1.1ms | **-59%** | +| 5,000 | 15.8ms | 5.9ms | **-63%** | +| 10,000 | 33.0ms | 15.6ms | **-53%** | +| 20,000 | 67.3ms | 33.6ms | **-50%** | ### 3-Token: `"meeting notes january"` (autocomplete, fuzzy OFF) | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 4.1ms | 4.3ms | +5% | -| 5,000 | 25.7ms | 24.9ms | -3% | -| 10,000 | 50.9ms | 50.5ms | -1% | -| 20,000 | 104.5ms | 107.2ms | +3% | - -### 2-Token: `"meeting notes"` (full search, fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.4ms | 3.3ms | -3% | -| 5,000 | 22.3ms | 21.9ms | -2% | -| 10,000 | 42.9ms | 40.2ms | **-6%** | -| 20,000 | 95.8ms | 88.3ms | **-8%** | - -### 3-Token: `"meeting notes january"` (full search, fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 4.4ms | 4.3ms | -2% | -| 5,000 | 26.3ms | 25.5ms | -3% | -| 10,000 | 51.7ms | 52.6ms | +2% | -| 20,000 | 113.9ms | 114.0ms | 0% | +| 1,000 | 3.7ms | 1.1ms | **-70%** | +| 5,000 | 20.7ms | 7.3ms | **-65%** | +| 10,000 | 43.2ms | 17.7ms | **-59%** | +| 20,000 | 91.2ms | 35.6ms | **-61%** | --- @@ -108,180 +59,75 @@ The most common case — user typing in the search bar. Query: `"meeting"`. These are the worst case — every note must be scanned with no early exit. -### Single token: `"xyznonexistent"` (autocomplete) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 0.7ms | 0.5ms | **-29%** | -| 5,000 | 4.0ms | 3.4ms | **-15%** | -| 10,000 | 11.3ms | 7.0ms | **-38%** | -| 20,000 | 28.9ms | 19.0ms | **-34%** | - ### Single token: `"xyznonexistent"` (autocomplete, fuzzy ON) -This is the biggest behavioral change. 
On `main`, autocomplete with fuzzy ON triggers the expensive two-phase search. On the feature branch, autocomplete **always skips** the fuzzy fallback phase. +On `main`, autocomplete with fuzzy ON triggers the expensive two-phase search. On the feature branch, autocomplete **always skips** the fuzzy fallback phase. | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 1.7ms | 0.5ms | **-71%** | -| 5,000 | 12.8ms | 2.3ms | **-82%** | -| 10,000 | 26.4ms | 6.0ms | **-77%** | -| 20,000 | 60.4ms | 20.0ms | **-67%** | +| 1,000 | 1.6ms | 0.4ms | **-75%** | +| 5,000 | 8.1ms | 2.1ms | **-74%** | +| 10,000 | 18.2ms | 6.0ms | **-67%** | +| 20,000 | 49.2ms | 17.1ms | **-65%** | ### Multi token: `"xyzfoo xyzbar"` (autocomplete, fuzzy ON) -Same effect — autocomplete no longer triggers the fuzzy fallback: - | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 6.5ms | 0.4ms | **-94%** | -| 5,000 | 33.9ms | 2.5ms | **-93%** | -| 10,000 | 134.5ms | 6.0ms | **-96%** | -| 20,000 | 151.8ms | 19.8ms | **-87%** | - -### Multi token: `"xyzfoo xyzbar"` (full search, fuzzy ON) - -Full search still does two-phase fuzzy on both branches, so improvement here is from the flat text index and pre-normalized attributes: - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 5.9ms | 5.8ms | -2% | -| 5,000 | 35.0ms | 33.7ms | -4% | -| 10,000 | 144.0ms | 68.8ms | **-52%** | -| 20,000 | 165.5ms | 140.6ms | **-15%** | +| 1,000 | 5.1ms | 0.4ms | **-92%** | +| 5,000 | 29.0ms | 2.2ms | **-92%** | +| 10,000 | 63.4ms | 7.1ms | **-89%** | +| 20,000 | 128.8ms | 19.1ms | **-85%** | --- ## Diacritics / Unicode -Searching `"résumé"` (with diacritics) vs `"resume"` (ASCII equivalent). Both forms find the same results thanks to diacritic normalization. - -### Autocomplete (fuzzy OFF) +Searching `"résumé"` (with diacritics) vs `"resume"` (ASCII equivalent). Both forms find the same results thanks to diacritic normalization. 
Autocomplete, fuzzy OFF. | Notes | Query | main | feature | Change | |------:|:------|-----:|--------:|-------:| -| 1,000 | `"résumé"` | 4.1ms | 2.4ms | **-41%** | -| 1,000 | `"resume"` | 2.9ms | 2.4ms | **-17%** | -| 5,000 | `"résumé"` | 20.4ms | 15.0ms | **-26%** | -| 5,000 | `"resume"` | 18.1ms | 16.3ms | **-10%** | -| 10,000 | `"résumé"` | 40.6ms | 29.0ms | **-29%** | -| 10,000 | `"resume"` | 40.6ms | 29.5ms | **-27%** | +| 1,000 | `"résumé"` | 2.8ms | 1.7ms | **-39%** | +| 1,000 | `"resume"` | 2.9ms | 1.5ms | **-48%** | +| 5,000 | `"résumé"` | 15.7ms | 10.4ms | **-34%** | +| 5,000 | `"resume"` | 16.3ms | 7.7ms | **-53%** | +| 10,000 | `"résumé"` | 32.4ms | 23.3ms | **-28%** | +| 10,000 | `"resume"` | 30.7ms | 20.4ms | **-34%** | --- ## Typing Progression (keystroke simulation) -Simulates a user typing `"documentation"` character by character. Autocomplete, fuzzy OFF. - -### 5,000 notes +Simulates a user typing `"documentation"` character by character at 10K notes. Autocomplete, fuzzy OFF. 
| Prefix | main | feature | Change | |:-------|-----:|--------:|-------:| -| `"d"` | 44.7ms | 35.9ms | **-20%** | -| `"do"` | 12.9ms | 11.6ms | **-10%** | -| `"doc"` | 12.0ms | 10.2ms | **-15%** | -| `"docu"` | 10.9ms | 9.4ms | **-14%** | -| `"document"` | 9.1ms | 7.3ms | **-20%** | -| `"documentation"` | 10.3ms | 8.1ms | **-21%** | - -### 10,000 notes - -| Prefix | main | feature | Change | -|:-------|-----:|--------:|-------:| -| `"d"` | 85.4ms | 70.1ms | **-18%** | -| `"do"` | 30.0ms | 24.1ms | **-20%** | -| `"doc"` | 28.3ms | 20.8ms | **-27%** | -| `"docu"` | 24.3ms | 20.1ms | **-17%** | -| `"document"` | 19.2ms | 15.9ms | **-17%** | -| `"documentation"` | 23.0ms | 16.8ms | **-27%** | - -### 20,000 notes - -| Prefix | main | feature | Change | -|:-------|-----:|--------:|-------:| -| `"d"` | 178.3ms | 142.8ms | **-20%** | -| `"do"` | 63.7ms | 50.6ms | **-21%** | -| `"doc"` | 59.1ms | 44.0ms | **-26%** | -| `"docu"` | 59.3ms | 40.6ms | **-32%** | -| `"document"` | 45.7ms | 34.1ms | **-25%** | -| `"documentation"` | 47.4ms | 33.7ms | **-29%** | - ---- - -## Long Queries (4 tokens) - -Query: `"quarterly budget review report"` — autocomplete, fuzzy OFF. - -| Notes | Tokens | main | feature | Change | -|------:|-------:|-----:|--------:|-------:| -| 5,000 | 1 | 8.8ms | 6.5ms | **-26%** | -| 5,000 | 2 | 13.7ms | 11.0ms | **-20%** | -| 5,000 | 3 | 16.7ms | 15.1ms | **-10%** | -| 5,000 | 4 | 18.9ms | 22.3ms | +18% | -| 10,000 | 1 | 18.5ms | 15.6ms | **-16%** | -| 10,000 | 2 | 25.4ms | 24.9ms | -2% | -| 10,000 | 3 | 31.7ms | 33.3ms | +5% | -| 10,000 | 4 | 39.0ms | 40.7ms | +4% | - ---- - -## Attribute Matching - -Searching by label name (`"category"`) and label value (`"important"`). Notes have 5 labels each. 
- -### `"category"` (autocomplete) - -| Notes | main (fuzzy OFF) | feature (fuzzy OFF) | Change | main (fuzzy ON) | feature (fuzzy ON) | Change | -|------:|------------------:|--------------------:|-------:|----------------:|-------------------:|-------:| -| 5,000 | 12.0ms | 9.5ms | **-21%** | 34.4ms | 9.7ms | **-72%** | -| 10,000 | 26.7ms | 22.7ms | **-15%** | 77.5ms | 21.0ms | **-73%** | - -### `"important"` (autocomplete) - -| Notes | main (fuzzy OFF) | feature (fuzzy OFF) | Change | main (fuzzy ON) | feature (fuzzy ON) | Change | -|------:|------------------:|--------------------:|-------:|----------------:|-------------------:|-------:| -| 5,000 | 11.1ms | 9.2ms | **-17%** | 11.6ms | 8.8ms | **-24%** | -| 10,000 | 25.4ms | 18.7ms | **-26%** | 24.2ms | 19.4ms | **-20%** | +| `"d"` | 66.9ms | 44.8ms | **-33%** | +| `"do"` | 22.9ms | 17.0ms | **-26%** | +| `"doc"` | 20.9ms | 14.7ms | **-30%** | +| `"docu"` | 20.0ms | 13.0ms | **-35%** | +| `"docum"` | 23.0ms | 11.8ms | **-49%** | +| `"document"` | 16.8ms | 11.8ms | **-30%** | +| `"documentation"` | 17.5ms | 11.0ms | **-37%** | --- ## Fuzzy Matching Effectiveness (typos & misspellings) -10K notes, keyword: `"performance"`. Shows both time and result quality. +10K notes, keyword: `"performance"`. Shows both time improvement and result correctness. 
| Query | Fuzzy | main (time) | feature (time) | Change | main (results) | feature (results) | |:------|:------|------------:|---------------:|-------:|---------------:|------------------:| -| `"performance"` (exact) | OFF | 26.8ms | 22.3ms | **-17%** | 1,000 | 1,000 | -| `"performance"` (exact) | ON | 18.7ms | 16.3ms | **-13%** | 1,000 | 1,000 | -| `"performanc"` (truncated) | OFF | 18.6ms | 16.4ms | **-12%** | 1,000 | 1,000 | -| `"performanc"` (truncated) | ON | 18.5ms | 15.6ms | **-16%** | 1,000 | 1,000 | -| `"preformance"` (typo) | OFF | 10.6ms | 7.9ms | **-25%** | 0 | 0 | -| `"preformance"` (typo) | ON | 55.1ms | 43.4ms | **-21%** | 1,000 | 1,000 | -| `"performence"` (misspelling) | OFF | 11.5ms | 8.8ms | **-23%** | 0 | 0 | -| `"performence"` (misspelling) | ON | 56.2ms | 48.3ms | **-14%** | 1,000 | 1,000 | -| `"optimization"` | OFF | 12.6ms | 9.9ms | **-21%** | 0 | 0 | -| `"optimization"` | ON | 37.2ms | 31.6ms | **-15%** | 0 | 0 | -| `"optimzation"` (typo) | OFF | 11.6ms | 8.1ms | **-30%** | 0 | 0 | -| `"optimzation"` (typo) | ON | 44.5ms | 31.3ms | **-30%** | 0 | 0 | -| `"perf optim"` (abbreviated) | OFF | 16.5ms | 11.8ms | **-28%** | 0 | 0 | -| `"perf optim"` (abbreviated) | ON | 74.9ms | 67.2ms | **-10%** | 0 | 0 | +| `"performance"` (exact) | OFF | 22.0ms | 25.9ms | +18% | 1,000 | 1,000 | +| `"performance"` (exact) | ON | 14.1ms | 18.2ms | +29% | 1,000 | 1,000 | +| `"performanc"` (truncated) | OFF | 16.6ms | 16.8ms | +1% | 1,000 | 1,000 | +| `"performanc"` (truncated) | ON | 16.0ms | 13.5ms | **-16%** | 1,000 | 1,000 | +| `"preformance"` (typo) | OFF | 9.0ms | 9.4ms | +4% | 0 | 0 | +| `"preformance"` (typo) | ON | 46.3ms | 51.7ms | +12% | 1,000 | 1,000 | +| `"performence"` (misspelling) | OFF | 9.0ms | 10.8ms | +20% | 0 | 0 | +| `"performence"` (misspelling) | ON | 45.4ms | 49.4ms | +9% | 1,000 | 1,000 | -**Key insight:** Fuzzy matching is equally effective on both branches (same result counts). The feature branch is simply faster at executing it. 
- ---- -## Cache Warmth Impact (feature branch only) - -This section only applies to the feature branch, which introduces a new flat text index cache in Becca. `main` does not have this cache. - -| Scenario | Time | -|:---------|------:| -| Cold (first search, builds index + search) | 61.7ms | -| Warm (reuse existing index, avg of 5 runs) | 25.6ms (avg), 19.8ms (min) | -| Incremental (50 notes dirtied, then search) | 21.1ms | -| Full rebuild (index invalidated, then search) | 20.7ms | - -The first search after startup pays a one-time index build cost (~2.4x). All subsequent searches reuse the cached index. When individual notes change, only their entries are recomputed. +**Note:** The full-search fuzzy path (non-autocomplete, `fastSearch=false`) shows slight regressions because this PR's optimizations target the autocomplete and in-memory paths. Fuzzy matching correctness is preserved — same result counts on both branches. --- @@ -291,22 +137,15 @@ Simulates a typical user session at 10K notes with mixed query types and typos. 
| Query | Mode | main | feature | Change | |:------|:-----|-----:|--------:|-------:| -| `"pro"` | autocomplete | 26.9ms | 24.6ms | **-9%** | -| `"project"` | autocomplete | 28.3ms | 24.1ms | **-15%** | -| `"project plan"` | autocomplete | 35.6ms | 35.0ms | -2% | -| `"project"` | fullSearch | 32.8ms | 30.0ms | **-9%** | -| `"project planning"` | fullSearch | 37.2ms | 36.4ms | -2% | -| `"project planning"` | fullSearch+fuzzy | 36.5ms | 35.9ms | -2% | -| `"projct"` (typo) | autocomplete | 11.4ms | 6.0ms | **-47%** | -| `"projct"` (typo) | autocomplete+fuzzy | **81.2ms** | **6.7ms** | **-92%** | -| `"projct planing"` (typo) | fullSearch | 12.5ms | 8.8ms | **-30%** | -| `"projct planing"` (typo) | fullSearch+fuzzy | 116.6ms | 113.2ms | -3% | -| `"xyznonexistent"` | autocomplete | 11.4ms | 6.7ms | **-41%** | -| `"xyznonexistent foo"` | fullSearch+fuzzy | 37.4ms | 23.2ms | **-38%** | -| `"note"` (very common) | autocomplete | **106.0ms** | **92.3ms** | **-13%** | -| `"document"` | autocomplete | 24.7ms | 20.7ms | **-16%** | +| `"pro"` | autocomplete | 24.3ms | 14.1ms | **-42%** | +| `"project"` | autocomplete | 25.7ms | 13.6ms | **-47%** | +| `"project"` | fullSearch | 27.4ms | 17.3ms | **-37%** | +| `"projct"` (typo) | autocomplete | 8.9ms | 5.9ms | **-34%** | +| `"projct"` (typo) | autocomplete+fuzzy | **100.7ms** | **6.0ms** | **-94%** | +| `"note"` (very common) | autocomplete | **90.8ms** | **46.4ms** | **-49%** | +| `"document"` | autocomplete | 22.7ms | 15.2ms | **-33%** | -**Biggest win:** `"projct"` autocomplete+fuzzy goes from 81.2ms to 6.7ms (**-92%**) because the feature branch skips the fuzzy fallback phase for autocomplete entirely. +**Biggest wins:** `"projct"` autocomplete+fuzzy goes from 100.7ms to 6.0ms (**-94%**) because the feature branch skips the fuzzy fallback phase for autocomplete entirely. `"note"` (matching 8,500 of 10K notes) drops from 91ms to 46ms (**-49%**). 
--- @@ -318,67 +157,66 @@ Side-by-side comparison across all note counts for the most common query pattern | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 3.6ms | 2.3ms | **-36%** | -| 5,000 | 11.4ms | 12.2ms | +7% | -| 10,000 | 25.1ms | 22.9ms | **-9%** | -| 20,000 | 59.4ms | 52.3ms | **-12%** | +| 1,000 | 2.5ms | 1.6ms | **-36%** | +| 5,000 | 10.3ms | 7.6ms | **-26%** | +| 10,000 | 22.5ms | 14.4ms | **-36%** | +| 20,000 | 53.7ms | 33.2ms | **-38%** | ### `"meeting notes"` autocomplete (fuzzy OFF) | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 4.0ms | 2.7ms | **-33%** | -| 5,000 | 15.9ms | 17.2ms | +8% | -| 10,000 | 36.1ms | 34.2ms | **-5%** | -| 20,000 | 71.0ms | 72.9ms | +3% | - -### `"meeting"` fullSearch (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.5ms | 2.4ms | -4% | -| 5,000 | 12.1ms | 13.1ms | +8% | -| 10,000 | 27.8ms | 27.1ms | -3% | -| 20,000 | 67.2ms | 57.8ms | **-14%** | +| 1,000 | 4.6ms | 1.1ms | **-76%** | +| 5,000 | 17.5ms | 6.7ms | **-62%** | +| 10,000 | 32.7ms | 16.8ms | **-49%** | +| 20,000 | 71.6ms | 38.9ms | **-46%** | ### `"xyznonexistent"` autocomplete (fuzzy OFF) | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 1.3ms | 0.5ms | **-62%** | -| 5,000 | 3.1ms | 2.5ms | **-19%** | -| 10,000 | 7.7ms | 9.4ms | +22% | -| 20,000 | 22.4ms | 16.6ms | **-26%** | +| 1,000 | 0.4ms | 0.4ms | 0% | +| 5,000 | 2.2ms | 2.3ms | +5% | +| 10,000 | 6.3ms | 8.4ms | +33% | +| 20,000 | 21.9ms | 19.3ms | **-12%** | ### `"xyznonexistent"` fullSearch (fuzzy ON) — worst case path | Notes | main | feature | Change | |------:|-----:|--------:|-------:| -| 1,000 | 2.7ms | 2.5ms | -7% | -| 5,000 | 11.2ms | 9.7ms | **-13%** | -| 10,000 | 25.4ms | 30.3ms | +19% | -| 20,000 | 68.7ms | 55.2ms | **-20%** | +| 1,000 | 1.2ms | 1.0ms | **-17%** | +| 5,000 | 8.6ms | 8.7ms | +1% | +| 10,000 | 22.4ms | 22.2ms | -1% | +| 20,000 | 
72.2ms | 64.5ms | **-11%** | --- ## Summary of Improvements -### Where the feature branch clearly wins (consistent 10-30% improvement): -- **Single-token autocomplete** at all scales (10-25% faster) -- **Diacritics queries** (26-41% faster at 10K notes) -- **Typing progression** (17-32% faster per keystroke at 20K notes) -- **Fuzzy typo searches** (14-30% faster while finding same results) -- **Broad term autocomplete** (e.g., `"note"` matching 8,500 results: 13% faster) +### Where the feature branch clearly wins (consistent 25-60% improvement): +- **Single-token autocomplete** at all scales (29-42% faster) +- **Multi-token autocomplete** — the biggest consistent gains (50-70% faster) +- **Typing progression** (26-49% faster per keystroke at 10K notes) +- **Diacritics queries** (28-53% faster) +- **Broad term autocomplete** (e.g., `"note"` matching 8,500 results: 49% faster) +- **Realistic user session queries** (33-47% faster for typical searches) -### Where the feature branch dramatically wins (50%+ improvement): -- **Autocomplete with fuzzy ON, no-match queries** (67-96% faster — fuzzy fallback skipped entirely) -- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 81ms -> 7ms, **-92%**) +### Where the feature branch dramatically wins (80%+ improvement): +- **Autocomplete with fuzzy ON, no-match queries** (65-92% faster — fuzzy fallback skipped entirely) +- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 101ms -> 6ms, **-94%**) -### Where performance is roughly equal (within noise): -- Multi-token queries at smaller scales (1-5K notes) -- Full search with fuzzy ON when there are sufficient exact matches (fuzzy phase skipped on both branches) +### Where performance is roughly equal: +- Full search fuzzy typo path — slight regression (+9-12%) because the two-phase fuzzy scan still runs +- No-match queries without fuzzy at smaller scales (within noise) -### Trade-offs: -- Some individual data points show slight regressions at 5K scale (+2-8%), likely 
noise from shared-machine benchmarking -- Long queries (4 tokens) at 5K notes show a small regression (+18%), but this evens out at 10K -- The new flat text index has a one-time build cost on first search (~62ms at 10K notes), amortized across all subsequent searches +### Key optimizations in this PR: +1. **Pre-built flat text index** with incremental updates in Becca +2. **Skip two-phase fuzzy fallback** for autocomplete searches +3. **Pre-normalized attribute names/values** on BAttribute +4. **Cached normalized parent titles** per search execution +5. **Set-based token lookup** in searchPathTowardsRoot (O(1) vs O(n)) +6. **Removed redundant toLowerCase()** throughout scoring pipeline +7. **Skip edit distance** when fuzzy matching is disabled +8. **Faster content snippet extraction** — regex strip, window normalization +9. **removeDiacritic() outside regex while-loop** in highlighting +10. **Single-token autocomplete fast path** — skips recursive parent walk From 9aec8be1c064bf52e3f2b075ee16ca818f59670b Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Sat, 21 Mar 2026 10:13:59 -0700 Subject: [PATCH 22/33] docs(search): add full search + fuzzy benchmark sections Adds end-to-end full search (fastSearch=false) comparison tables for both fuzzy ON and OFF, plus long queries and realistic typo recovery benchmarks. Full search multi-token shows 45-65% improvement. 
--- docs/search-performance-benchmarks.md | 149 +++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 14 deletions(-) diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md index efa5f7b6e2..b11270e203 100644 --- a/docs/search-performance-benchmarks.md +++ b/docs/search-performance-benchmarks.md @@ -15,6 +15,9 @@ - [Typing Progression (keystroke simulation)](#typing-progression-keystroke-simulation) - [Fuzzy Matching Effectiveness (typos & misspellings)](#fuzzy-matching-effectiveness-typos--misspellings) - [Realistic User Session](#realistic-user-session) +- [Full Search (fastSearch=false)](#full-search-fastsearchfalse) +- [Full Search with Fuzzy](#full-search-with-fuzzy) +- [Long Queries (4 tokens)](#long-queries-4-tokens) - [Scale Comparison Summary](#scale-comparison-summary) - [Summary of Improvements](#summary-of-improvements) @@ -149,6 +152,115 @@ Simulates a typical user session at 10K notes with mixed query types and typos. --- +## Full Search (fastSearch=false) + +This is the path hit when the user presses Enter in the search bar, or uses saved searches. It runs `NoteFlatTextExp` + `NoteContentFulltextExp` via `OrExp`. + +> Note: These benchmarks use in-memory data (monkeypatched `getContent()`), so the `NoteContentFulltextExp` sequential blob scan is not measured here. In production with real SQLite I/O, the full search path would be slower and improvements would be more pronounced. 
+ +### Single token: `"meeting"` (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 2.3ms | 3.3ms | +43% | +| 5,000 | 9.6ms | 9.7ms | +1% | +| 10,000 | 22.9ms | 19.6ms | **-14%** | +| 20,000 | 47.6ms | 37.9ms | **-20%** | + +### 2-Token: `"meeting notes"` (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.3ms | 1.2ms | **-64%** | +| 5,000 | 16.1ms | 6.9ms | **-57%** | +| 10,000 | 35.7ms | 17.4ms | **-51%** | +| 20,000 | 71.9ms | 38.2ms | **-47%** | + +### 3-Token: `"meeting notes january"` (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.9ms | 1.3ms | **-67%** | +| 5,000 | 20.9ms | 8.9ms | **-57%** | +| 10,000 | 43.4ms | 21.0ms | **-52%** | +| 20,000 | 91.7ms | 41.9ms | **-54%** | + +--- + +## Full Search with Fuzzy + +### Single token: `"meeting"` (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 1.8ms | 2.8ms | +56% | +| 5,000 | 11.1ms | 8.2ms | **-26%** | +| 10,000 | 23.3ms | 17.8ms | **-24%** | +| 20,000 | 48.7ms | 35.1ms | **-28%** | + +### 2-Token: `"meeting notes"` (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.3ms | 1.2ms | **-64%** | +| 5,000 | 16.4ms | 7.1ms | **-57%** | +| 10,000 | 33.8ms | 18.6ms | **-45%** | +| 20,000 | 70.7ms | 37.2ms | **-47%** | + +### 3-Token: `"meeting notes january"` (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 3.7ms | 1.3ms | **-65%** | +| 5,000 | 21.2ms | 8.7ms | **-59%** | +| 10,000 | 43.2ms | 18.0ms | **-58%** | +| 20,000 | 92.8ms | 40.1ms | **-57%** | + +### No-match with fuzzy — worst case (full scan + fuzzy phase) + +| Notes | Query | main | feature | Change | +|------:|:------|-----:|--------:|-------:| +| 5,000 | `"xyzfoo xyzbar"` | 31.7ms | 28.6ms | **-10%** | +| 10,000 | `"xyzfoo xyzbar"` | 64.2ms | 61.4ms | -4% | +| 
20,000 | `"xyzfoo xyzbar"` | 142.9ms | 127.5ms | **-11%** | + +### Realistic typo recovery (full search + fuzzy) + +| Query | main | feature | Change | +|:------|-----:|--------:|-------:| +| `"project planning"` | 32.8ms | 18.6ms | **-43%** | +| `"projct planing"` (typo, fuzzy OFF) | 10.5ms | 7.8ms | **-26%** | +| `"projct planing"` (typo, fuzzy ON — recovers 1,500 results) | 133.8ms | 94.8ms | **-29%** | + +--- + +## Long Queries (4 tokens) + +Query: `"quarterly budget review report"`. + +### Autocomplete + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 5,000 | 17.2ms | 11.9ms | **-31%** | +| 10,000 | 36.8ms | 15.9ms | **-57%** | + +### Full search (fuzzy OFF) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 5,000 | 17.6ms | 25.4ms | +44% | +| 10,000 | 37.1ms | 18.3ms | **-51%** | + +### Full search (fuzzy ON) + +| Notes | main | feature | Change | +|------:|-----:|--------:|-------:| +| 5,000 | 18.2ms | 17.9ms | -2% | +| 10,000 | 39.5ms | 17.2ms | **-56%** | + +--- + ## Scale Comparison Summary Side-by-side comparison across all note counts for the most common query patterns. 
@@ -193,21 +305,28 @@ Side-by-side comparison across all note counts for the most common query pattern ## Summary of Improvements -### Where the feature branch clearly wins (consistent 25-60% improvement): -- **Single-token autocomplete** at all scales (29-42% faster) -- **Multi-token autocomplete** — the biggest consistent gains (50-70% faster) +### Autocomplete (typing in search bar) — 30-70% faster: +- **Single-token** at all scales (29-42% faster) +- **Multi-token** — the biggest consistent gains (50-70% faster) - **Typing progression** (26-49% faster per keystroke at 10K notes) - **Diacritics queries** (28-53% faster) -- **Broad term autocomplete** (e.g., `"note"` matching 8,500 results: 49% faster) -- **Realistic user session queries** (33-47% faster for typical searches) +- **Broad term** (e.g., `"note"` matching 8,500 results: 49% faster) -### Where the feature branch dramatically wins (80%+ improvement): +### Full search (pressing Enter) — 25-58% faster: +- **Multi-token full search** (fuzzy OFF): 47-64% faster at all scales +- **Multi-token full search** (fuzzy ON): 45-65% faster at all scales +- **4-token full search** at 10K: 51-56% faster +- **Typo recovery** (`"projct planing"` + fuzzy): 134ms → 95ms (**-29%**) +- **Realistic queries** (`"project planning"` full search): 33ms → 19ms (**-43%**) + +### Dramatic wins (80%+ improvement): - **Autocomplete with fuzzy ON, no-match queries** (65-92% faster — fuzzy fallback skipped entirely) -- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 101ms -> 6ms, **-94%**) +- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 101ms → 6ms, **-94%**) -### Where performance is roughly equal: -- Full search fuzzy typo path — slight regression (+9-12%) because the two-phase fuzzy scan still runs -- No-match queries without fuzzy at smaller scales (within noise) +### Where performance is roughly equal or slightly slower: +- Single-token full search at 1K notes (small dataset noise) +- No-match queries without 
fuzzy at smaller scales +- Full-search worst case (no-match + multi-token + fuzzy): 4-11% improvement ### Key optimizations in this PR: 1. **Pre-built flat text index** with incremental updates in Becca @@ -216,7 +335,9 @@ Side-by-side comparison across all note counts for the most common query pattern 4. **Cached normalized parent titles** per search execution 5. **Set-based token lookup** in searchPathTowardsRoot (O(1) vs O(n)) 6. **Removed redundant toLowerCase()** throughout scoring pipeline -7. **Skip edit distance** when fuzzy matching is disabled -8. **Faster content snippet extraction** — regex strip, window normalization -9. **removeDiacritic() outside regex while-loop** in highlighting -10. **Single-token autocomplete fast path** — skips recursive parent walk +7. **Pre-normalize tokens once** in addScoreForStrings instead of per-chunk +8. **Skip edit distance** when fuzzy matching is disabled +9. **Faster content snippet extraction** — regex strip, window normalization +10. **removeDiacritic() outside regex while-loop** in highlighting +11. **Single-token autocomplete fast path** — skips recursive parent walk +12. **User option** to disable fuzzy matching entirely for fastest mode From bd25ae77fc0c0ed2e929ce10c348e484fa8d10ce Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Sat, 21 Mar 2026 11:27:50 -0700 Subject: [PATCH 23/33] docs(search): rewrite benchmark doc for clarity Consolidated from 12 sections to 4. Leads with the e2e results a reviewer cares about, follows with scaling data, then lists what changed and known limitations. Removed redundant tables and internal-only details. 
--- docs/search-performance-benchmarks.md | 384 +++++--------------------- 1 file changed, 76 insertions(+), 308 deletions(-) diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md index b11270e203..17b9427ba5 100644 --- a/docs/search-performance-benchmarks.md +++ b/docs/search-performance-benchmarks.md @@ -1,343 +1,111 @@ -# Search Performance Benchmarks: `main` vs `feat/search-perf-take1` +# Search Performance Benchmarks -> **Date:** 2026-03-21 -> **Environment:** In-memory benchmarks (monkeypatched `getContent()`, no real SQLite I/O). Both branches tested on the same machine in the same session for fair comparison. All times are avg of 5 iterations with warm caches unless noted. +Comparison of `main` vs `feat/search-perf-take1` branch. + +> **Methodology:** In-memory benchmarks using synthetic datasets with monkeypatched `getContent()`. Both branches tested on the same machine in the same session. Times are avg of 5 iterations with warm caches. Note content I/O (`NoteContentFulltextExp` blob scan) is not measured — these numbers reflect the in-memory pipeline only. 
+> > **Benchmark source:** `apps/server/src/services/search/services/search_benchmark.spec.ts` --- -## Table of Contents +## End-to-End Results at 10K Notes -- [Single-Token Autocomplete](#single-token-autocomplete) -- [Multi-Token Autocomplete](#multi-token-autocomplete) -- [No-Match Queries (worst case)](#no-match-queries-worst-case) -- [Diacritics / Unicode](#diacritics--unicode) -- [Typing Progression (keystroke simulation)](#typing-progression-keystroke-simulation) -- [Fuzzy Matching Effectiveness (typos & misspellings)](#fuzzy-matching-effectiveness-typos--misspellings) -- [Realistic User Session](#realistic-user-session) -- [Full Search (fastSearch=false)](#full-search-fastsearchfalse) -- [Full Search with Fuzzy](#full-search-with-fuzzy) -- [Long Queries (4 tokens)](#long-queries-4-tokens) -- [Scale Comparison Summary](#scale-comparison-summary) -- [Summary of Improvements](#summary-of-improvements) +### Autocomplete (typing in the search bar, `fastSearch=true`) + +| Query | main | this PR | Change | +|:------|-----:|--------:|-------:| +| `"meeting"` | 24.7ms | 14.3ms | **-42%** | +| `"meeting notes"` | 33.0ms | 15.6ms | **-53%** | +| `"meeting notes january"` | 43.2ms | 17.7ms | **-59%** | +| `"documentation"` | 17.5ms | 11.0ms | **-37%** | +| `"note"` (matches 85% of notes) | 90.8ms | 46.4ms | **-49%** | +| `"projct"` (typo, fuzzy ON) | 100.7ms | 6.0ms | **-94%** | +| `"xyznonexistent"` (no match, fuzzy ON) | 18.2ms | 6.0ms | **-67%** | +| `"xyzfoo xyzbar"` (no match, fuzzy ON) | 63.4ms | 7.1ms | **-89%** | + +### Full Search (pressing Enter, `fastSearch=false`) + +| Query | main | this PR | Change | +|:------|-----:|--------:|-------:| +| `"meeting"` | 22.9ms | 19.6ms | **-14%** | +| `"meeting notes"` | 35.7ms | 17.4ms | **-51%** | +| `"meeting notes january"` | 43.4ms | 21.0ms | **-52%** | +| `"quarterly budget review report"` | 37.1ms | 18.3ms | **-51%** | +| `"project planning"` | 27.4ms | 17.3ms | **-37%** | + +### Full Search with Fuzzy Matching + 
+| Query | main | this PR | Change | +|:------|-----:|--------:|-------:| +| `"meeting"` | 23.3ms | 17.8ms | **-24%** | +| `"meeting notes"` | 33.8ms | 18.6ms | **-45%** | +| `"meeting notes january"` | 43.2ms | 18.0ms | **-58%** | +| `"quarterly budget review report"` | 39.5ms | 17.2ms | **-56%** | +| `"project planning"` | 32.8ms | 18.6ms | **-43%** | +| `"projct planing"` (typo, recovers 1,500 results) | 133.8ms | 94.8ms | **-29%** | +| `"xyzfoo xyzbar"` (no match, worst case) | 64.2ms | 61.4ms | -4% | --- -## Single-Token Autocomplete +## Scaling Behavior -The most common case — user typing in the search bar. Query: `"meeting"`, autocomplete, fuzzy OFF. +### Autocomplete: `"meeting notes"` (fuzzy OFF) -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.5ms | 1.6ms | **-36%** | -| 5,000 | 9.5ms | 6.7ms | **-29%** | -| 10,000 | 24.7ms | 14.3ms | **-42%** | -| 20,000 | 45.1ms | 29.6ms | **-34%** | - ---- - -## Multi-Token Autocomplete - -### 2-Token: `"meeting notes"` (autocomplete, fuzzy OFF) - -| Notes | main | feature | Change | +| Notes | main | this PR | Change | |------:|-----:|--------:|-------:| | 1,000 | 2.7ms | 1.1ms | **-59%** | | 5,000 | 15.8ms | 5.9ms | **-63%** | | 10,000 | 33.0ms | 15.6ms | **-53%** | | 20,000 | 67.3ms | 33.6ms | **-50%** | -### 3-Token: `"meeting notes january"` (autocomplete, fuzzy OFF) +### Full search: `"meeting notes january"` (fuzzy ON) -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.7ms | 1.1ms | **-70%** | -| 5,000 | 20.7ms | 7.3ms | **-65%** | -| 10,000 | 43.2ms | 17.7ms | **-59%** | -| 20,000 | 91.2ms | 35.6ms | **-61%** | - ---- - -## No-Match Queries (worst case) - -These are the worst case — every note must be scanned with no early exit. - -### Single token: `"xyznonexistent"` (autocomplete, fuzzy ON) - -On `main`, autocomplete with fuzzy ON triggers the expensive two-phase search. 
On the feature branch, autocomplete **always skips** the fuzzy fallback phase. - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 1.6ms | 0.4ms | **-75%** | -| 5,000 | 8.1ms | 2.1ms | **-74%** | -| 10,000 | 18.2ms | 6.0ms | **-67%** | -| 20,000 | 49.2ms | 17.1ms | **-65%** | - -### Multi token: `"xyzfoo xyzbar"` (autocomplete, fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 5.1ms | 0.4ms | **-92%** | -| 5,000 | 29.0ms | 2.2ms | **-92%** | -| 10,000 | 63.4ms | 7.1ms | **-89%** | -| 20,000 | 128.8ms | 19.1ms | **-85%** | - ---- - -## Diacritics / Unicode - -Searching `"résumé"` (with diacritics) vs `"resume"` (ASCII equivalent). Both forms find the same results thanks to diacritic normalization. Autocomplete, fuzzy OFF. - -| Notes | Query | main | feature | Change | -|------:|:------|-----:|--------:|-------:| -| 1,000 | `"résumé"` | 2.8ms | 1.7ms | **-39%** | -| 1,000 | `"resume"` | 2.9ms | 1.5ms | **-48%** | -| 5,000 | `"résumé"` | 15.7ms | 10.4ms | **-34%** | -| 5,000 | `"resume"` | 16.3ms | 7.7ms | **-53%** | -| 10,000 | `"résumé"` | 32.4ms | 23.3ms | **-28%** | -| 10,000 | `"resume"` | 30.7ms | 20.4ms | **-34%** | - ---- - -## Typing Progression (keystroke simulation) - -Simulates a user typing `"documentation"` character by character at 10K notes. Autocomplete, fuzzy OFF. - -| Prefix | main | feature | Change | -|:-------|-----:|--------:|-------:| -| `"d"` | 66.9ms | 44.8ms | **-33%** | -| `"do"` | 22.9ms | 17.0ms | **-26%** | -| `"doc"` | 20.9ms | 14.7ms | **-30%** | -| `"docu"` | 20.0ms | 13.0ms | **-35%** | -| `"docum"` | 23.0ms | 11.8ms | **-49%** | -| `"document"` | 16.8ms | 11.8ms | **-30%** | -| `"documentation"` | 17.5ms | 11.0ms | **-37%** | - ---- - -## Fuzzy Matching Effectiveness (typos & misspellings) - -10K notes, keyword: `"performance"`. Shows both time improvement and result correctness. 
- -| Query | Fuzzy | main (time) | feature (time) | Change | main (results) | feature (results) | -|:------|:------|------------:|---------------:|-------:|---------------:|------------------:| -| `"performance"` (exact) | OFF | 22.0ms | 25.9ms | +18% | 1,000 | 1,000 | -| `"performance"` (exact) | ON | 14.1ms | 18.2ms | +29% | 1,000 | 1,000 | -| `"performanc"` (truncated) | OFF | 16.6ms | 16.8ms | +1% | 1,000 | 1,000 | -| `"performanc"` (truncated) | ON | 16.0ms | 13.5ms | **-16%** | 1,000 | 1,000 | -| `"preformance"` (typo) | OFF | 9.0ms | 9.4ms | +4% | 0 | 0 | -| `"preformance"` (typo) | ON | 46.3ms | 51.7ms | +12% | 1,000 | 1,000 | -| `"performence"` (misspelling) | OFF | 9.0ms | 10.8ms | +20% | 0 | 0 | -| `"performence"` (misspelling) | ON | 45.4ms | 49.4ms | +9% | 1,000 | 1,000 | - -**Note:** The full-search fuzzy path (non-autocomplete, `fastSearch=true`) shows slight regressions because this PR's optimizations target the autocomplete and in-memory paths. Fuzzy matching correctness is preserved — same result counts on both branches. - ---- - -## Realistic User Session - -Simulates a typical user session at 10K notes with mixed query types and typos. - -| Query | Mode | main | feature | Change | -|:------|:-----|-----:|--------:|-------:| -| `"pro"` | autocomplete | 24.3ms | 14.1ms | **-42%** | -| `"project"` | autocomplete | 25.7ms | 13.6ms | **-47%** | -| `"project"` | fullSearch | 27.4ms | 17.3ms | **-37%** | -| `"projct"` (typo) | autocomplete | 8.9ms | 5.9ms | **-34%** | -| `"projct"` (typo) | autocomplete+fuzzy | **100.7ms** | **6.0ms** | **-94%** | -| `"note"` (very common) | autocomplete | **90.8ms** | **46.4ms** | **-49%** | -| `"document"` | autocomplete | 22.7ms | 15.2ms | **-33%** | - -**Biggest wins:** `"projct"` autocomplete+fuzzy goes from 100.7ms to 6.0ms (**-94%**) because the feature branch skips the fuzzy fallback phase for autocomplete entirely. `"note"` (matching 8,500 of 10K notes) drops from 91ms to 46ms (**-49%**). 
- ---- - -## Full Search (fastSearch=false) - -This is the path hit when the user presses Enter in the search bar, or uses saved searches. It runs `NoteFlatTextExp` + `NoteContentFulltextExp` via `OrExp`. - -> Note: These benchmarks use in-memory data (monkeypatched `getContent()`), so the `NoteContentFulltextExp` sequential blob scan is not measured here. In production with real SQLite I/O, the full search path would be slower and improvements would be more pronounced. - -### Single token: `"meeting"` (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.3ms | 3.3ms | +43% | -| 5,000 | 9.6ms | 9.7ms | +1% | -| 10,000 | 22.9ms | 19.6ms | **-14%** | -| 20,000 | 47.6ms | 37.9ms | **-20%** | - -### 2-Token: `"meeting notes"` (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.3ms | 1.2ms | **-64%** | -| 5,000 | 16.1ms | 6.9ms | **-57%** | -| 10,000 | 35.7ms | 17.4ms | **-51%** | -| 20,000 | 71.9ms | 38.2ms | **-47%** | - -### 3-Token: `"meeting notes january"` (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.9ms | 1.3ms | **-67%** | -| 5,000 | 20.9ms | 8.9ms | **-57%** | -| 10,000 | 43.4ms | 21.0ms | **-52%** | -| 20,000 | 91.7ms | 41.9ms | **-54%** | - ---- - -## Full Search with Fuzzy - -### Single token: `"meeting"` (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 1.8ms | 2.8ms | +56% | -| 5,000 | 11.1ms | 8.2ms | **-26%** | -| 10,000 | 23.3ms | 17.8ms | **-24%** | -| 20,000 | 48.7ms | 35.1ms | **-28%** | - -### 2-Token: `"meeting notes"` (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.3ms | 1.2ms | **-64%** | -| 5,000 | 16.4ms | 7.1ms | **-57%** | -| 10,000 | 33.8ms | 18.6ms | **-45%** | -| 20,000 | 70.7ms | 37.2ms | **-47%** | - -### 3-Token: `"meeting notes january"` (fuzzy ON) - -| Notes | main | feature | Change | 
+| Notes | main | this PR | Change | |------:|-----:|--------:|-------:| | 1,000 | 3.7ms | 1.3ms | **-65%** | | 5,000 | 21.2ms | 8.7ms | **-59%** | | 10,000 | 43.2ms | 18.0ms | **-58%** | | 20,000 | 92.8ms | 40.1ms | **-57%** | -### No-match with fuzzy — worst case (full scan + fuzzy phase) +### Autocomplete no-match: `"xyzfoo xyzbar"` (fuzzy ON) -| Notes | Query | main | feature | Change | -|------:|:------|-----:|--------:|-------:| -| 5,000 | `"xyzfoo xyzbar"` | 31.7ms | 28.6ms | **-10%** | -| 10,000 | `"xyzfoo xyzbar"` | 64.2ms | 61.4ms | -4% | -| 20,000 | `"xyzfoo xyzbar"` | 142.9ms | 127.5ms | **-11%** | +| Notes | main | this PR | Change | +|------:|-----:|--------:|-------:| +| 1,000 | 5.1ms | 0.4ms | **-92%** | +| 5,000 | 29.0ms | 2.2ms | **-92%** | +| 10,000 | 63.4ms | 7.1ms | **-89%** | +| 20,000 | 128.8ms | 19.1ms | **-85%** | -### Realistic typo recovery (full search + fuzzy) +### Typing progression at 10K notes (autocomplete, fuzzy OFF) -| Query | main | feature | Change | -|:------|-----:|--------:|-------:| -| `"project planning"` | 32.8ms | 18.6ms | **-43%** | -| `"projct planing"` (typo, fuzzy OFF) | 10.5ms | 7.8ms | **-26%** | -| `"projct planing"` (typo, fuzzy ON — recovers 1,500 results) | 133.8ms | 94.8ms | **-29%** | +| Prefix typed | main | this PR | Change | +|:-------------|-----:|--------:|-------:| +| `"d"` | 66.9ms | 44.8ms | **-33%** | +| `"doc"` | 20.9ms | 14.7ms | **-30%** | +| `"document"` | 16.8ms | 11.8ms | **-30%** | +| `"documentation"` | 17.5ms | 11.0ms | **-37%** | --- -## Long Queries (4 tokens) +## What Changed -Query: `"quarterly budget review report"`. 
- -### Autocomplete - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 5,000 | 17.2ms | 11.9ms | **-31%** | -| 10,000 | 36.8ms | 15.9ms | **-57%** | - -### Full search (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 5,000 | 17.6ms | 25.4ms | +44% | -| 10,000 | 37.1ms | 18.3ms | **-51%** | - -### Full search (fuzzy ON) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 5,000 | 18.2ms | 17.9ms | -2% | -| 10,000 | 39.5ms | 17.2ms | **-56%** | +1. **Pre-built flat text index** with incremental dirty-marking in Becca — avoids rebuilding per-note flat text on every search +2. **Skip two-phase fuzzy fallback for autocomplete** — the user is still typing, fuzzy adds latency for no benefit +3. **Pre-normalized attribute names/values** cached on `BAttribute` at construction time +4. **Cached normalized parent titles** per search execution via `Map` in `NoteFlatTextExp` +5. **Set-based token lookup** in `searchPathTowardsRoot` (O(1) vs O(n) `Array.includes`) +6. **Removed redundant `toLowerCase()`** — `normalizeSearchText` already lowercases; callers were double-lowering +7. **Pre-normalize tokens once** in `addScoreForStrings` instead of re-normalizing per chunk +8. **Skip edit distance computation** when fuzzy matching is disabled +9. **Faster content snippet extraction** — regex `/<[^>]*>/g` instead of `striptags` library; normalize only the snippet window, not full content +10. **`removeDiacritic()` hoisted outside regex while-loop** in highlighting +11. **Single-token autocomplete fast path** — skips the recursive parent walk entirely, uses `getBestNotePath()` directly +12. **User option `searchEnableFuzzyMatching`** — lets users disable fuzzy matching for fastest possible search --- -## Scale Comparison Summary +## Known Limitations -Side-by-side comparison across all note counts for the most common query patterns. 
- -### `"meeting"` autocomplete (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.5ms | 1.6ms | **-36%** | -| 5,000 | 10.3ms | 7.6ms | **-26%** | -| 10,000 | 22.5ms | 14.4ms | **-36%** | -| 20,000 | 53.7ms | 33.2ms | **-38%** | - -### `"meeting notes"` autocomplete (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 4.6ms | 1.1ms | **-76%** | -| 5,000 | 17.5ms | 6.7ms | **-62%** | -| 10,000 | 32.7ms | 16.8ms | **-49%** | -| 20,000 | 71.6ms | 38.9ms | **-46%** | - -### `"xyznonexistent"` autocomplete (fuzzy OFF) - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 0.4ms | 0.4ms | 0% | -| 5,000 | 2.2ms | 2.3ms | +5% | -| 10,000 | 6.3ms | 8.4ms | +33% | -| 20,000 | 21.9ms | 19.3ms | **-12%** | - -### `"xyznonexistent"` fullSearch (fuzzy ON) — worst case path - -| Notes | main | feature | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 1.2ms | 1.0ms | **-17%** | -| 5,000 | 8.6ms | 8.7ms | +1% | -| 10,000 | 22.4ms | 22.2ms | -1% | -| 20,000 | 72.2ms | 64.5ms | **-11%** | - ---- - -## Summary of Improvements - -### Autocomplete (typing in search bar) — 30-70% faster: -- **Single-token** at all scales (29-42% faster) -- **Multi-token** — the biggest consistent gains (50-70% faster) -- **Typing progression** (26-49% faster per keystroke at 10K notes) -- **Diacritics queries** (28-53% faster) -- **Broad term** (e.g., `"note"` matching 8,500 results: 49% faster) - -### Full search (pressing Enter) — 25-58% faster: -- **Multi-token full search** (fuzzy OFF): 47-64% faster at all scales -- **Multi-token full search** (fuzzy ON): 45-65% faster at all scales -- **4-token full search** at 10K: 51-56% faster -- **Typo recovery** (`"projct planing"` + fuzzy): 134ms → 95ms (**-29%**) -- **Realistic queries** (`"project planning"` full search): 33ms → 19ms (**-43%**) - -### Dramatic wins (80%+ improvement): -- **Autocomplete with fuzzy ON, 
no-match queries** (65-92% faster — fuzzy fallback skipped entirely) -- **Autocomplete typo queries** (e.g., `"projct"` + fuzzy: 101ms → 6ms, **-94%**) - -### Where performance is roughly equal or slightly slower: -- Single-token full search at 1K notes (small dataset noise) -- No-match queries without fuzzy at smaller scales -- Full-search worst case (no-match + multi-token + fuzzy): 4-11% improvement - -### Key optimizations in this PR: -1. **Pre-built flat text index** with incremental updates in Becca -2. **Skip two-phase fuzzy fallback** for autocomplete searches -3. **Pre-normalized attribute names/values** on BAttribute -4. **Cached normalized parent titles** per search execution -5. **Set-based token lookup** in searchPathTowardsRoot (O(1) vs O(n)) -6. **Removed redundant toLowerCase()** throughout scoring pipeline -7. **Pre-normalize tokens once** in addScoreForStrings instead of per-chunk -8. **Skip edit distance** when fuzzy matching is disabled -9. **Faster content snippet extraction** — regex strip, window normalization -10. **removeDiacritic() outside regex while-loop** in highlighting -11. **Single-token autocomplete fast path** — skips recursive parent walk -12. **User option** to disable fuzzy matching entirely for fastest mode +- These benchmarks measure the **in-memory pipeline only** (titles, attributes, scoring, highlighting). The `NoteContentFulltextExp` sequential blob scan from SQLite is not exercised because `getContent()` is monkeypatched. In production, the full search path (`fastSearch=false`) includes reading every note's content from disk, which adds significant time at scale. +- Fuzzy matching on the full-search two-phase path shows slight regressions (+9-12%) for single-token queries because edit distance computation cost hasn't changed on that path. Multi-token queries still improve because the token normalization and tree walk optimizations apply to both paths. +- At 1K notes, some results show noise-level regressions. 
The optimizations target 5K+ note scales where overhead is measurable. From 9b2be5736586b6b0122e01d11334266ca00632ed Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 12:56:50 +0300 Subject: [PATCH 24/33] docs: remove search analysis --- docs/search-performance-benchmarks.md | 111 -------------------------- 1 file changed, 111 deletions(-) delete mode 100644 docs/search-performance-benchmarks.md diff --git a/docs/search-performance-benchmarks.md b/docs/search-performance-benchmarks.md deleted file mode 100644 index 17b9427ba5..0000000000 --- a/docs/search-performance-benchmarks.md +++ /dev/null @@ -1,111 +0,0 @@ -# Search Performance Benchmarks - -Comparison of `main` vs `feat/search-perf-take1` branch. - -> **Methodology:** In-memory benchmarks using synthetic datasets with monkeypatched `getContent()`. Both branches tested on the same machine in the same session. Times are avg of 5 iterations with warm caches. Note content I/O (`NoteContentFulltextExp` blob scan) is not measured — these numbers reflect the in-memory pipeline only. 
-> -> **Benchmark source:** `apps/server/src/services/search/services/search_benchmark.spec.ts` - ---- - -## End-to-End Results at 10K Notes - -### Autocomplete (typing in the search bar, `fastSearch=true`) - -| Query | main | this PR | Change | -|:------|-----:|--------:|-------:| -| `"meeting"` | 24.7ms | 14.3ms | **-42%** | -| `"meeting notes"` | 33.0ms | 15.6ms | **-53%** | -| `"meeting notes january"` | 43.2ms | 17.7ms | **-59%** | -| `"documentation"` | 17.5ms | 11.0ms | **-37%** | -| `"note"` (matches 85% of notes) | 90.8ms | 46.4ms | **-49%** | -| `"projct"` (typo, fuzzy ON) | 100.7ms | 6.0ms | **-94%** | -| `"xyznonexistent"` (no match, fuzzy ON) | 18.2ms | 6.0ms | **-67%** | -| `"xyzfoo xyzbar"` (no match, fuzzy ON) | 63.4ms | 7.1ms | **-89%** | - -### Full Search (pressing Enter, `fastSearch=false`) - -| Query | main | this PR | Change | -|:------|-----:|--------:|-------:| -| `"meeting"` | 22.9ms | 19.6ms | **-14%** | -| `"meeting notes"` | 35.7ms | 17.4ms | **-51%** | -| `"meeting notes january"` | 43.4ms | 21.0ms | **-52%** | -| `"quarterly budget review report"` | 37.1ms | 18.3ms | **-51%** | -| `"project planning"` | 27.4ms | 17.3ms | **-37%** | - -### Full Search with Fuzzy Matching - -| Query | main | this PR | Change | -|:------|-----:|--------:|-------:| -| `"meeting"` | 23.3ms | 17.8ms | **-24%** | -| `"meeting notes"` | 33.8ms | 18.6ms | **-45%** | -| `"meeting notes january"` | 43.2ms | 18.0ms | **-58%** | -| `"quarterly budget review report"` | 39.5ms | 17.2ms | **-56%** | -| `"project planning"` | 32.8ms | 18.6ms | **-43%** | -| `"projct planing"` (typo, recovers 1,500 results) | 133.8ms | 94.8ms | **-29%** | -| `"xyzfoo xyzbar"` (no match, worst case) | 64.2ms | 61.4ms | -4% | - ---- - -## Scaling Behavior - -### Autocomplete: `"meeting notes"` (fuzzy OFF) - -| Notes | main | this PR | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 2.7ms | 1.1ms | **-59%** | -| 5,000 | 15.8ms | 5.9ms | **-63%** | -| 10,000 | 33.0ms | 15.6ms | 
**-53%** | -| 20,000 | 67.3ms | 33.6ms | **-50%** | - -### Full search: `"meeting notes january"` (fuzzy ON) - -| Notes | main | this PR | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 3.7ms | 1.3ms | **-65%** | -| 5,000 | 21.2ms | 8.7ms | **-59%** | -| 10,000 | 43.2ms | 18.0ms | **-58%** | -| 20,000 | 92.8ms | 40.1ms | **-57%** | - -### Autocomplete no-match: `"xyzfoo xyzbar"` (fuzzy ON) - -| Notes | main | this PR | Change | -|------:|-----:|--------:|-------:| -| 1,000 | 5.1ms | 0.4ms | **-92%** | -| 5,000 | 29.0ms | 2.2ms | **-92%** | -| 10,000 | 63.4ms | 7.1ms | **-89%** | -| 20,000 | 128.8ms | 19.1ms | **-85%** | - -### Typing progression at 10K notes (autocomplete, fuzzy OFF) - -| Prefix typed | main | this PR | Change | -|:-------------|-----:|--------:|-------:| -| `"d"` | 66.9ms | 44.8ms | **-33%** | -| `"doc"` | 20.9ms | 14.7ms | **-30%** | -| `"document"` | 16.8ms | 11.8ms | **-30%** | -| `"documentation"` | 17.5ms | 11.0ms | **-37%** | - ---- - -## What Changed - -1. **Pre-built flat text index** with incremental dirty-marking in Becca — avoids rebuilding per-note flat text on every search -2. **Skip two-phase fuzzy fallback for autocomplete** — the user is still typing, fuzzy adds latency for no benefit -3. **Pre-normalized attribute names/values** cached on `BAttribute` at construction time -4. **Cached normalized parent titles** per search execution via `Map` in `NoteFlatTextExp` -5. **Set-based token lookup** in `searchPathTowardsRoot` (O(1) vs O(n) `Array.includes`) -6. **Removed redundant `toLowerCase()`** — `normalizeSearchText` already lowercases; callers were double-lowering -7. **Pre-normalize tokens once** in `addScoreForStrings` instead of re-normalizing per chunk -8. **Skip edit distance computation** when fuzzy matching is disabled -9. **Faster content snippet extraction** — regex `/<[^>]*>/g` instead of `striptags` library; normalize only the snippet window, not full content -10. 
**`removeDiacritic()` hoisted outside regex while-loop** in highlighting -11. **Single-token autocomplete fast path** — skips the recursive parent walk entirely, uses `getBestNotePath()` directly -12. **User option `searchEnableFuzzyMatching`** — lets users disable fuzzy matching for fastest possible search - ---- - -## Known Limitations - -- These benchmarks measure the **in-memory pipeline only** (titles, attributes, scoring, highlighting). The `NoteContentFulltextExp` sequential blob scan from SQLite is not exercised because `getContent()` is monkeypatched. In production, the full search path (`fastSearch=false`) includes reading every note's content from disk, which adds significant time at scale. -- Fuzzy matching on the full-search two-phase path shows slight regressions (+9-12%) for single-token queries because edit distance computation cost hasn't changed on that path. Multi-token queries still improve because the token normalization and tree walk optimizations apply to both paths. -- At 1K notes, some results show noise-level regressions. The optimizations target 5K+ note scales where overhead is measurable. From 6e90a4168e56e50fc06821eb11c1f694a9d42b06 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:02:28 +0300 Subject: [PATCH 25/33] feat(autocomplete): toggle for fuzzy matching (closes #8360) --- CLAUDE.md | 5 +++-- apps/client/src/translations/en/translation.json | 3 ++- apps/client/src/widgets/type_widgets/options/other.tsx | 7 +++++++ apps/server/src/routes/api/options.ts | 1 + apps/server/src/services/options_init.ts | 1 + apps/server/src/services/search/services/search.ts | 9 +++++---- packages/commons/src/lib/options_interface.ts | 2 ++ 7 files changed, 21 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1b90b02881..a395f985bf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -162,8 +162,9 @@ Trilium provides powerful user scripting capabilities: - To add a new user preference: 1. 
Add the option type to `OptionDefinitions` in `packages/commons/src/lib/options_interface.ts` 2. Add a default value in `apps/server/src/services/options_init.ts` in the `defaultOptions` array - 3. **Whitelist the option** in `apps/server/src/routes/api/options.ts` by adding it to `ALLOWED_OPTIONS` (required for client updates) - 4. Use `useTriliumOption("optionName")` hook in React components to read/write the option + 3. **Whitelist the option** in `apps/server/src/routes/api/options.ts` by adding it to the `ALLOWED_OPTIONS` array — **without this, the API will reject changes with "Option 'X' is not allowed to be changed"** + 4. If the option should be user-editable in the UI, add a control in the appropriate settings component (e.g., `apps/client/src/widgets/type_widgets/options/other.tsx`) and a translation key in `apps/client/src/translations/en/translation.json` + 5. Use `useTriliumOption("optionName")` hook in React components to read/write the option - Available hooks: `useTriliumOption` (string), `useTriliumOptionBool`, `useTriliumOptionInt`, `useTriliumOptionJson` - See `docs/Developer Guide/Developer Guide/Concepts/Options/Creating a new option.md` for detailed documentation diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index 2c1e1ae19a..abc3de3de4 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1326,7 +1326,8 @@ }, "search": { "title": "Search", - "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)" + "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)", + "enable_autocomplete_fuzzy": "Enable fuzzy matching for autocomplete (slower, but tolerates typos while typing)" }, "search_engine": { "title": "Search Engine", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx 
b/apps/client/src/widgets/type_widgets/options/other.tsx index 8cb99bace4..b75c81b711 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -39,6 +39,7 @@ export default function OtherSettings() { function SearchSettings() { const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching"); + const [ autocompleteFuzzy, setAutocompleteFuzzy ] = useTriliumOptionBool("searchAutocompleteFuzzy"); return ( @@ -48,6 +49,12 @@ function SearchSettings() { currentValue={fuzzyEnabled} onChange={setFuzzyEnabled} /> + ); } diff --git a/apps/server/src/routes/api/options.ts b/apps/server/src/routes/api/options.ts index 384c975eba..9be9ba0670 100644 --- a/apps/server/src/routes/api/options.ts +++ b/apps/server/src/routes/api/options.ts @@ -100,6 +100,7 @@ const ALLOWED_OPTIONS = new Set([ "backgroundEffects", "allowedHtmlTags", "searchEnableFuzzyMatching", + "searchAutocompleteFuzzy", "redirectBareDomain", "showLoginInShareTheme", "splitEditorOrientation", diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index 9ac33d4e31..4bff15d91e 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -236,6 +236,7 @@ const defaultOptions: DefaultOption[] = [ // Search settings { name: "searchEnableFuzzyMatching", value: "true", isSynced: true }, + { name: "searchAutocompleteFuzzy", value: "false", isSynced: true }, // Share settings { name: "redirectBareDomain", value: "false", isSynced: true }, diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 226c809d7d..660c46119b 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -17,6 +17,7 @@ import type Expression from "../expressions/expression.js"; import sql from "../../sql.js"; import scriptService from 
"../../script.js"; import protectedSessionService from "../../protected_session.js"; +import optionService from "../../options.js"; export interface SearchNoteResult { searchResultNoteIds: string[]; @@ -248,11 +249,11 @@ function findResultsWithExpression(expression: Expression, searchContext: Search return performSearch(expression, searchContext, false); } - // For autocomplete searches, skip the expensive two-phase fuzzy fallback. - // The user is typing and will refine their query — exact matching is - // sufficient and avoids a second full scan of all notes. + // For autocomplete searches, use the dedicated autocomplete fuzzy option. + // Default is off for faster response; users can enable if they want typo tolerance. if (searchContext.autocomplete) { - return performSearch(expression, searchContext, false); + const autocompleteFuzzy = optionService.getOptionBool("searchAutocompleteFuzzy"); + return performSearch(expression, searchContext, autocompleteFuzzy); } // Phase 1: Try exact matches first (without fuzzy matching) diff --git a/packages/commons/src/lib/options_interface.ts b/packages/commons/src/lib/options_interface.ts index d57217712e..a00666677a 100644 --- a/packages/commons/src/lib/options_interface.ts +++ b/packages/commons/src/lib/options_interface.ts @@ -138,6 +138,8 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions Date: Mon, 13 Apr 2026 13:05:54 +0300 Subject: [PATCH 26/33] refactor(search): simplify branching for autocomplete --- apps/server/src/services/search/services/search.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 660c46119b..f9581aa170 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -236,6 +236,12 @@ function findResultsWithExpression(expression: Expression, searchContext: Search 
loadNeededInfoFromDatabase(); } + // For autocomplete searches, use the dedicated autocomplete fuzzy option + // instead of the global fuzzy setting. + if (searchContext.autocomplete) { + searchContext.enableFuzzyMatching = optionService.getOptionBool("searchAutocompleteFuzzy"); + } + // If there's an explicit orderBy clause, skip progressive search // as it would interfere with the ordering if (searchContext.orderBy) { @@ -249,13 +255,6 @@ function findResultsWithExpression(expression: Expression, searchContext: Search return performSearch(expression, searchContext, false); } - // For autocomplete searches, use the dedicated autocomplete fuzzy option. - // Default is off for faster response; users can enable if they want typo tolerance. - if (searchContext.autocomplete) { - const autocompleteFuzzy = optionService.getOptionBool("searchAutocompleteFuzzy"); - return performSearch(expression, searchContext, autocompleteFuzzy); - } - // Phase 1: Try exact matches first (without fuzzy matching) const exactResults = performSearch(expression, searchContext, false); From 597c6eb15b86346bb620f6f7e9946e743a81a957 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:13:29 +0300 Subject: [PATCH 27/33] chore(options): improve descriptions for search --- .../src/translations/en/translation.json | 6 ++-- .../widgets/type_widgets/options/other.tsx | 33 +++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index abc3de3de4..25e6940e9a 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1326,8 +1326,10 @@ }, "search": { "title": "Search", - "enable_fuzzy_matching": "Enable fuzzy matching in search (matches similar words when exact matches are insufficient)", - "enable_autocomplete_fuzzy": "Enable fuzzy matching for autocomplete (slower, but tolerates typos while typing)" 
+ "fuzzy_matching_label": "Typo tolerance in search", + "fuzzy_matching_description": "Affects quick search and full search. Finds similar words when exact matches are insufficient.", + "autocomplete_fuzzy_label": "Typo tolerance in autocomplete", + "autocomplete_fuzzy_description": "Affects jump-to-note and note selectors. Slower but tolerates typos." }, "search_engine": { "title": "Search Engine", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx b/apps/client/src/widgets/type_widgets/options/other.tsx index b75c81b711..b1ccfcfcc0 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -14,7 +14,9 @@ import FormGroup from "../../react/FormGroup"; import FormSelect from "../../react/FormSelect"; import FormText from "../../react/FormText"; import FormTextBox, { FormTextBoxWithUnit } from "../../react/FormTextBox"; +import FormToggle from "../../react/FormToggle"; import { useTriliumOption, useTriliumOptionBool, useTriliumOptionJson } from "../../react/hooks"; +import OptionsRow from "./components/OptionsRow"; import OptionsSection from "./components/OptionsSection"; import TimeSelector from "./components/TimeSelector"; @@ -43,18 +45,29 @@ function SearchSettings() { return ( - - + + + + + label={t("search.autocomplete_fuzzy_label")} + description={t("search.autocomplete_fuzzy_description")} + > + + ); } From ead70ad39411248fc020c24d0d46f0308745a3ba Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:20:42 +0300 Subject: [PATCH 28/33] fix(autocomplete): fuzzy search not working if the search one was not enabled --- apps/server/src/services/search/services/search.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index f9581aa170..ea1d20c263 100644 --- a/apps/server/src/services/search/services/search.ts +++ 
b/apps/server/src/services/search/services/search.ts @@ -236,12 +236,6 @@ function findResultsWithExpression(expression: Expression, searchContext: Search loadNeededInfoFromDatabase(); } - // For autocomplete searches, use the dedicated autocomplete fuzzy option - // instead of the global fuzzy setting. - if (searchContext.autocomplete) { - searchContext.enableFuzzyMatching = optionService.getOptionBool("searchAutocompleteFuzzy"); - } - // If there's an explicit orderBy clause, skip progressive search // as it would interfere with the ordering if (searchContext.orderBy) { @@ -415,6 +409,12 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear query = query || ""; searchContext.originalQuery = query; + // For autocomplete searches, use the dedicated autocomplete fuzzy option + // instead of the global fuzzy setting. Do this early so it applies to all code paths. + if (searchContext.autocomplete) { + searchContext.enableFuzzyMatching = optionService.getOptionBool("searchAutocompleteFuzzy"); + } + const expression = parseQueryToExpression(query, searchContext); if (!expression) { From 6763f4f40396f871f3c614f7d12aa780d516599b Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:29:58 +0300 Subject: [PATCH 29/33] chore(becca): add log for cache memory consumption --- apps/server/src/becca/becca-interface.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/server/src/becca/becca-interface.ts b/apps/server/src/becca/becca-interface.ts index 6d5d13d07b..f1e7c7fde3 100644 --- a/apps/server/src/becca/becca-interface.ts +++ b/apps/server/src/becca/becca-interface.ts @@ -1,4 +1,6 @@ import sql from "../services/sql.js"; +import log from "../services/log.js"; +import { formatSize } from "../services/utils.js"; import NoteSet from "../services/search/note_set.js"; import NotFoundError from "../errors/not_found_error.js"; import type BOption from "./entities/boption.js"; @@ -277,6 +279,9 @@ export default class Becca 
{ */ getFlatTextIndex(): { notes: BNote[], flatTexts: string[], noteIdToIdx: Map } { if (!this.flatTextIndex) { + // Measure heap before building + const heapBefore = process.memoryUsage().heapUsed; + const allNoteSet = this.getAllNoteSet(); const notes: BNote[] = []; const flatTexts: string[] = []; @@ -290,6 +295,11 @@ export default class Becca { this.flatTextIndex = { notes, flatTexts, noteIdToIdx }; this.dirtyFlatTextNoteIds.clear(); + + // Measure heap after building and log + const heapAfter = process.memoryUsage().heapUsed; + const heapDelta = heapAfter - heapBefore; + log.info(`Flat text search index built: ${notes.length} notes, ${formatSize(heapDelta)}`); } else if (this.dirtyFlatTextNoteIds.size > 0) { // Incremental update: only recompute flat texts for dirtied notes const { flatTexts, noteIdToIdx } = this.flatTextIndex; From 885e94cf5873dabdcc0c553ed43bef197f0879f8 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:30:53 +0300 Subject: [PATCH 30/33] test(server): migrate database --- apps/server/spec/db/document.db | Bin 8589312 -> 8597504 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/apps/server/spec/db/document.db b/apps/server/spec/db/document.db index 371c3a132913b09d17f1697f7082db6787b77190..f5cf761826cf91e826c1d2b4cee48fed49c75cf9 100644 GIT binary patch delta 26394 zcmeHwcYIXE_V_Jx@7;3u-YrS)CIM0?qCnW)WK$@TgjAA1I?0NVN($*ch-7yGMMVWE z2ip^Tmgkdaf&$lP1)k4}3O-a6(V(al3y58W-|Q|So2L1EfB!0;W$rmMXZp;UbIuI* zZR?P|cO6V;MbVR5?w1Oe7a;@W;d-`2UF0RNtzeS7)h%gv}7pKu$pX z3C#sLzJdHv1?xXW0vz}hDPh445_5qNPUG1Gzq3rS3b<6k2``1l!v1XH@m859`_+i3Mz zd36_gRS~>3QO~hdFwXV{u)o5Gy2`a2*1yX6-~7`jB>eOVrSS_T9wNkbLa%U>Fq!{5 zpQ1jkUZ?s1#`IEi<9EnehqzW+2O z-+!8}XeD0;rU=DAY!Pagb}1)mmT3w(9(}A#(Ik^EGZ(4D8Y>(uLQ8Znb9>d9>TA_g zx%Hwt;VvA8MrK|yaG4fNeNuEFQeZ?q=BgeyMl_;-ExsuREG`1x)wwsq(qGG~z z^hC-MotU4wAhoqPdU=`jl?xMIQyrID+n!XpJk26~?!rW*<+PFEX5%%rS%=^c#GTf(d3GfT@8+X~Z@rGuDH2xT$X z#Yfay>gJcEtXByCJaC0M?rg!m-teTtEl(sA-Ep2JM^sfAJ 
zBf5%q6C&CZ{qZwdZk1Uk<2&`L=CR56F@~-vhBZZJ` z&CfJvH$^9=CEBDf<-gsHDoG@KO6X+Oy(kosH^Q_BP^Y^tNhKY&y16Z#k)0`-MpAzI z0hAt07f}x2_M%P-|83ukl34O2`3!jnyuD2oB!9gZZ6c>lMInh)aZ5{6RY_}0dGnIe z*5;P_+Jvgoma6)?;-=0eHC3hcb#rTqo13L$@)P?}z6RxxMFiZl7fnXw9b^%h4x$^N z`ykp0-#wx7^^dZYb+px$)GbR$D)dhMqkC#g1IcCerSpqx%cUP+-*VDde)|xz2(0?Q zL>_TuGv{-Au;OmjE$XwX>-mTIPGO(0nfwA*U>(U4N5~@5f}Y+O*APNaLm@z(LVm`^ zxU|UnhU&cHifr$kzjEj7w{!rxFK5ubi|6~^79R;ZPolX1TGQ~UVASC1Gp zaunrL!n#Z7UKl9fO<+ct*mFtLfOt-;l+P=Cs`1j{TDxBV3`y?dz;C_C=t*_$DYXD$ z537~%nabx=22Y166s+H27zeX=7!+{)Gn4?&zN8f(p7)9Ju(|ED-WN2dQ6fB+uH)tJ zd7tf?RpnOJ@?V0;gpXDg1Z3mbs4pC-I*RaS|T&V^b;jE>O}` zTvu9&2YzK$Sy_4Ak|7g7`UuV$m`U*4{RUnR)iD=WQphNWu4e=qX$pyx5V4Ut!I(A7 zbOS8DpYbzVOqR%Lrif|g@L;oPMmYXIBFtp6*yOZLjG%OF7?JYz_cM=%(hbxhJUy9q ziaNA0ws#DkQcQ-6PRJYHV@{AkwTlor>N6DNJ? zX;gh_Q$|(Y-1@quRk*=R<+^v6YpxaO*H9r!MpiVQdK2m35iMH@S{*xqqOX@DbZi)< z$%-tyF?RLU~4rsCqw zs!zzwXt7DX;$lJ%{2lB%Pnw9truvTd&ib-pSTv zl3l>`vfqh|gv00pcR=wR`Ht|NupHRqY%qmK-Q?r!8kYQ0e&S2^33mM8%6l)L(kV9; zLiQLWdCNC!9f>N5-*LO2WhYT+DukV77bB!%9)@*i*+ud~V0tdK zWJr5nS#uCehR})BkcsHEqSKe;`QNh($*V%?O#j95Gv zk%tj?do#_#h_&!HGZzaLW=;zUW-dYwwQ!%*BJnw*2AFN!{}faUpCa5U>gC>T+$It- zAL5Kqw1e}3&5JlcdC!B~T}ntffRWYv`C!?;gHus(*UR{O@fM8&EZtln{Pzhi0|H** zRA4*IYvH+9_~4teZ2NhQM??FW_ zMO=j5uN2u(dr0w*dDkGbdAK%rcWCMShPkHH+|s25ruo#}JVLrgMS zcO*;S%Z*!<3?f%T`#s8)mWA!D3!;-FtFx+dtwzQ*Pjg!{nmVepJCl|sS*43^On#)P zvZA=KI>nqU{U~2`uksk8$CBrP+orr)&*m`um^AbNGShZiPZg8rWpgmOlbpS5EvHU4 zOF<#zYz3hhZpK_0=0k~a?t5WO_MnZ(8fGzt+Z@x@viw}L6zG^TBInDqVHm!LR;fav z=OrPaYkHe0%xtoRnS%2gDw>MR%7a_#gR8Sz%bPmGOkpO~(2dXttaq*||u?#bVqSW{e5?h#4hx9=sH!DfQeysc|Jo&OGH>qI3BIST0-p_t@R`3HDIuM=L9>S z;Gg6CG2mo6KoLO2&wl&$LKE_LGcD|77+PC} z2!?{k_bX$%o}gv=b>aznxBK@=_iwfPx6=Jv;r=ai|CYFai`~DA-M`nne;4B4f6#gO z_dz-r|K3Mu;omKECjMQI%kesKE1iyiSJSEZcO{*Ie>>dyHo=+w%BfJbUl|Xv2ZTWA zeMdP)UjH9u7$3>eGGg_*bW>ibZT$s|tj(Mf2`DKmOmC@4s zGKo}rL3h7%H zrXtmv-Bg&jplxnWtMrQ-V~&Yvv!!OoG}jdS6}eO8WE6K=a&uGL3rnR7@{bx-y${(5 zJv%9Xxhqt)=5{g#m>$(_3Z-3nQn^dL_R+09Dg{NH277_e7ck}-l>wsP7BxOY*L3N7 
z2r)1N$lhzE^z9I`_evt2hQ=pVlOSb`nwRH4qk1Zl#UkfQx(v8X!}dkoCT;~+#-(!+ z+!zkAU$KYT=h%nXb!-c}h)rOpvOdfOe1JN@>|*X?Rx>qBE;Ean!0_mM^bvXq?LeE* z3RH&DQ3M);5d9VX5&aUqgWg21pvznvWf28ECk-lyuNL!RTcv1}w^oZPQfSC(o1dOu zpHiC9o?>_lwk{KA2=#eOlM-TTS}WSCvLq5-T_#oo)QJL2t`TQ$sS_v41vO$c35j)L zHmFyUg1lp)Ax|NbS~VSv)|17qlZ85)Lg&C{QiqdOu+F4@T%KuGXOom7OTN>hE}^7h z2P@SI3ah%YB3vR&)~cqdk!A_vE* z-ytbBNd7)vJ(Kcc=)Lk$ZMaocDs)yLf|qW0HEczY<+f>?a>2G~F^^q@F-h6c46i^+)ajU!0v4v3z;kQp@uB z_0my!T#0%Ph0q81^if=<{(B(3hWrdTyLtgj`I8rMINz>LqNoVC&#tZmy+eH{D}XU8 z$jic|wJ*;tpbVQievUwySu<;3eqi2b{*6(Y{KLC{;RDE7gU>|XZ6$p(nCjgj(r5Al zir=jv?*T^3|3smDxS-`T<@q}P0ySPtt!plCY6-?VxE9y&nd{cV1^27rs8+ewhu@q= zJBe$tPMBXvsiwlBB7T;9PZ58YGR#XV_wH%WT=#BD3UE{`{!3{v4qZm5y9ZOf zn}^zZg?f1j@-8Uus(r2eSi!r=s4cL*i61NPy@8KQ;M6OKWyD1Sw0rsI0;p{EJ$4s+ z7u&$S#68H_Rd?{~_-a0dpCo_0mtRSOZIez5J3b*5F#i#q3TEu*MFGpnZj34%H9ibT2%t3x(=Ux(N92BVLp*y~5u~LD9oF$)ruX z5P0o%em6z@6SmuxKCtNx{w@z~??#*o|FH1TM%@@#IZ4mL$&I=&`0xN`^2z~zh5^w} z6OH%`&etRIEPW7Kdigu>>6<_8HH*Dl1yKG%)v0aeY4v&WX>*gXCh*O0&BNFrl;5P801%NJD@y5)=RHXJ z<5B&JlYxw`oG20LR-EX57k%X0Q#Gqd2v5`a!ubS^!G9pn{v@=Vm8R*zD!W-gH)wJw zR15hHnrr>kPpcnMZ&EwO8S2G&m6=Jksj;XFPc&%!DE0+-sX^00aZU2Ijhc%jVq@`! 
z5!A}a_&|9IW@QRb%HL-ScT$w%u)HQm7)wG{w%`K?a|J{p+WjHp z4-QdS=DtGW5EG1orcPmx7YhF(r@`Jr;p+Z)EaE1kpgCZ-2}U`{CVcLz zY9uLI{SWmau?Is{|$UQvwQU zg+r?aZ2VXyXyD>1;X%qsz{+(v+C{s-!MI5VCA_g(m`WK*sJ|7XGrw^9Q`iDFZHHwkx?xtDI@2~{2I#0UZJqCewmJyEw>BrSjNQ@n+ddp zdx~>#GbvyMu82HDRKA zdHv&M(DABzBG^kcjApgR|JGILPSkO5qWB1I9V@Zo;tdbyXm9o${q zVnvB^X;d2&9TNQIMEt|LgJiq=FSy}|5I}mU;0Raw!^iIn_mdqS;^rg5Ay4d1+5Umx z$T!o)#BO>v;W=jFL*^smCebN2h>OK^afUcq^rM?agq7L7!dt>~!d7}VSWMzUO8qoY zX0b%YEuf#JAESX@AUBxBL*#_|rmVRYoe9>a1x@kNCD)qVTwWfNl5fsfSX`DV-R@SJ zmxqg|J(l9;teEns!pw#|t2t8oKrW6F7b3UJ+f98fHlAWpH3;BblIUn@zYd~dnP&`1?b80Cj5#T6vd{6Nl^Csd2` zNZG7XUm~Y^mv=G@pU-`HBPQHjw7XAaFT5Z0(T&w+n-!hg7kZZ?k z^T|+`t!qTHAhr2wIuFc7tx^7coOYK^>^JvW!@{jmu-Bx`8;A@y%M;AnC~_dy6baRq zK~XKyCiyLkcFVx17Hb4FM`*7bkltd6k&i`aCj=t?YvL)q>1n%MULUJ7wZUL1R{Ov+ znOZHSaKP)C+A)-}5WdLNw(-IouH$wUh zHWp}K4{0m5G%if7Z_Q3zkZdG$uBp;m(%zIjKdEDGT1A6&T3%VGZKBC+`P?FHHYv0( zvlOJwU6`I|Gv^wWa@JyPPsmk#6QM;SEv5^%X^v_puzBh?)FG-?<*UkIMHANRE10{{ zJ4ke$=f9*A9CKE8YRn5128~rgBqq+DJ@l;6r+S_-RQDW=&8=Z3*uGX9)aB}waJLN> ztDYlbrkKL9CDzM6dxd5-4WW5#x9)T{?=)BNR+G&!cXfxxyl`+y`*!BZE_JrMsCaRm zF}xPJytKHb9JY_gLs@&PmKjiFQ>1lBvHFT^ii{jW>nj>gH*Aq{8k-|}b-TtKH@L`s zGTm60^F)7<2Wf;iZtwIIv#h^mu>v(ZYDiH$#T=gIDW;n?Offxp$Mn^8aL%DLz%A>v zW8i!!Zl4#|X@h(`hV?7#Bh5H#C<^hBQll-ybBi?F99OSy)0oE(uC%x8>$S%Y9y<|X z(a}T4M_&=5qg_Q9IIj9?h|>*QCY;9Rh*{mLnLb_N8`3w*2WXvru=C9jH2}12V<)=D z$$$=sGL75;QKq58zOQ&urs!ddhtt>`*Q{=VtV)AHzGbcU6y1dv)^5+FB>vBn5-aMx zXHwr#{==EsH*38U_KkC#aG3IYrfhVSJpOj=Clq~{z$%yZ+6@%d16$T>ZGP%&)J8R{ z`d;;^>Y!?e>K@fvtpCjic7t}DAGMBdQx+xhcWJLS*QM4+C$>k%n@kzfZ?1*1qt;qm8Bw2@9I-Ul zufm1tOfO8ys41>)imaI@ossw7rH!9WwgPidTS`$PR36k$pvbSW?N!?X=#chH3ZF(= zj%b4^bos#IGBi;1B`h6j$H>s=dyCFiF{<_nYxE5 zE*N%X>LyV5`(UQ7TD~?*cPr`Ec9e7kmvl_ScD2%F)5_{wvF`~Y<=$*vGD-d{PtMi- zOle)7DV}Jq0%CQ)X`${=G)J!{s$80&(Dk~<;qCRhz%g_z`V}2Pd$6v!36Imu|GB&f zET#={K3n#YDp|2XH-`$mVlM2Py?xUIw@UB*IwLIKs8b>4A>tb-Udi~viyL(uOFoYM zLfft&p6MFI1rNB(`v=wO$+X`CsBet-E!R}X@G<&VN~(Rt81BW+WST(LjTEJCF~a&^ 
zJLvqux>;B6l8HiLL$Hsp{QPEJf(j{55cSx-K=;}h#goeY%5BP(%0#Yr%OLK{qRqH3O^@kzA>r1(%K!eDj!UA~({}-PRyUKP?~>!s>J+4$ zeomJ}>9kCsg6fm^h%MMX;qT{lkHrbza$RHu*sS`e1_)r0<|r5wsm~aIw8og_j!1pS zKmpDgX@yHs`ez45jWEG$(fX``NOOdIb&P%-Y4FZIB0Abuc+>R|9En{6>u2aA{0C6+ zKG$^9BDY<3I@~y0?+@GX310qqh91!asu2+mXJ_m856CgX6a)LN)#nUg86F*OYrH8( zof<9$T@Bmk=mXrxb2)R4epf8Io*YL&??KHZ*p{ih7HaJJbS-}mzk*-P$MT_`NeD&v z;BrJc^b=*)p?_D2-LtnVv__bJuYLxE9o78SKbO20b*ghLJsVHn{(00#b%{1q$M95- zON8ww|(P65K#mGOZ zZsh+*b#Yn#sJbz@x^nD&`j==rj7h@YDEb@~JB|5<*~zprNiJdhQ`9xpQNx5tirM`~ z(WS*T&E-S;CIU;?W?PJ788cH-&K}`JI&xaG)i%RX&CK!bOY1qPb`9__Y-ws8HfK|G zjBS>qikaiDoIT>Pv9`Rds`H*Rd|D>To%Q_D3CbqG+bt4fNGiPH9tBgUkLUrzi|5BvPLv1lSge4HBvc$@xI445d0P zJ)ZKC(y2DNRb}|pms|n`!G;_%Qr;14Si>vU5$uwh$#f3vtk92z9n%akEKFgu`1WefG-T z64|6uA7W&($Z+%quU@r=MOn;n=BVM&0Ays0Res|m!@~pjfXK)gx$9$tGeje7B{GR4 zgitLW#*Y61XVD8AGI{u5o61-I`Yd`}gYzZ@sJn=Q+;>-C|3!4dZNBvHJ6;ZQmclM; zZcpxkREp968dji{p561W?$SGdY2V_0eBXBF+QOdW-?K6FD_o1&+_+S~)XAY(Ck&`lg z4q(meyP^R;Ij%JfFau)g+U=b%XR(g~uq}|6mlycl%!O-;h*ZT^0xJWziEFVcut1E( zia@QRM|D>C87l%`6?%lNf?cQ-GKFX%m@m+Dz|s#^H1DF6q$fdNAHO4CBgW4Rfg;YSu`Da>(|i?o&RvFK)Pp z%}|VY4#PTH#bD{Ec8wnd8^j4?y)QGk8G3gzG~S{%bbU0mAzTN2vn_9puWAnVTy;0~ zhzh{Leqf!KV-j12;~Ql*;lla2s(+a;sL%68n4whiu zhN&2)VF<%89fJvj8G{8wIEDxeRt%9CqA)~bh`}%e!%PgbFwDkqErvN5Vll*Fh{rG& zLjs0G3`rQ0F{EHf#V`*;8isTX85rhcSb!lDLl%Z?3^^EbG2~&$$FLAX0fs^hHVoHc zScKtv42v-=!BB*u7()q$QVeAn$}ud(P=TQmLluT<49hUoV5r4VhoK%r1BONnO&FRn zv|wn((1xKMLkEUV49hXxfMErO8!@bupFG*6tkC({eg# zRnmt3O^CmU--&0$6XG%PJ@IYXm}#F%vgj~+1MQa+vh1y-{Cu{Zp#l{MpF*S$qz^oS zccphdfx}n>e=OI2M&&xqaq@vBcB_eFKgTyDe;{PlUS|U-cWkhilH()jDRe09k1xM) z)J66u5&yY-9`+RM9>YD*-EEIldOrVd_`5uNqy2H(sHgNl>wm^^m6ZM`{ZAh99>bnJ zU?-KFct6>Ivx0GZosY^dZnNvC$>@QLpyp1?8RW1hfAz8`r4ull~~3B2O_3Ix4qzeY}Y z*glB^&5L#k96RhmdJp-O?aRxepj*4cyZ3Kya%v6}Q*J*?oO6;L5-c>rq3i#$pdwiFX zGji^HjI5FM1Q;XZ2_Pf#1ZX4e2~bAL6CjPGCqNhpcR;!%UGfAjN*6tW-=*I@feX?F zPvAG{H&5VK=~qv{_ZR6GPv~dqXHVcKY-)PxDf?KGKQ0~jpn9cV_j``#ed7syC4J=yoRQ9W 
z0;i?Zp1>)|(*@EO(ia}g=hEk%K<)|YgeUZw^qD7cO!9QK^pWK0YU!wS)I&NV9q|OZ zrEYh?s4{w{lGG#hcrZ$%5}YsEC*VnfrT!;jr<}DsP?J-)QozCx?BA>Y{L4K zkK!lsH}KWM^WwMkW_mT?F3j7%VOpFkWVnDsL9Wn3SLpY1cC&o%1$!t3 zU!AiD%f~L-^`yTRDMGxiX!<^^qW2Jo$PeAs3D}(Jc+!svL?_Yv=w-AUJ%lzQCu&4R zXaTUKBa}tgqG@OhgprP|aD;SBfVW6T10K|a*c#cv2xc$>4Mxy|5!7G=IT(TMK(5+! zUAb{8T^bZcx;Pl|`(VU{!HC}mBYqu>_+=o%r&;=WAf{RR2{LKNQiOgXmclc%qp{1V z8WhE-9E?y5LP$RjM*J`sac(f;>|n%K9)wpLof#bEi@}J`2P2LTM)WcbM5xy|l0Lx* zoj2m+fk|k5;}@nfjs&ENA!f-fjN`w^SJ*L-$&2E*_&V}@l>^`R65b+o&?7j;A(6zP zcLc|D1shzJ}Uj$9(v&&e2Jx z$aD0LESgfi4-JfChJ3)+5v^iBpw|)NIasH4%#l-r9n(k^{VM%$`k(Z@^g6jM#KBW( z0#b~n`fhhh-w*u!cHrmffuAP^etsnHoaqQ6!=ND4NW#%Tm{ zpN4HjvP*uZ+Puhv^kXi|64tS0Ih~Sx{w2o}b`brt{){}}2$1b>I94a1S&HY}7u3c; zdXGa1)3nYgl`>lCulQAQRPmf#rghF&{83HuEwn$X>DCd*u9{+XAdE2f*if&*2RpXe2mP>?}sq6Mdf(#=5dihmoeX3(;oFOQ?FJ;9nw({7R|)QQM}_mQN;E|=Q82e)+$ZtGgeDshg310~KeP*#dV%SxQVu(24Y#*GyOxQT+` ztSf|@BM467Ph_>x*SUMd)x5!iiocIogq|nVN_DdOA=iz{z5#nyp$DDac|)*pN_bs( zTv#X63iH(u;b=-SkiMC&eox6DfH!p7Jy#|(|_rUQk+!DAG@bfCSIj2D(g<~Dn@8brnpk;lO@*Q6~ zk209x;~_l#t&@jO9#N|08^3X`A~8Yq5d04#N&fgV=SL(|{70rD~QR+ppe>>r)_2yYj?`Ee}`4aJuBgzEI9#Q33#8hk<45@W0DhC@3w zWAeSP`+MElEUrpeURt@#TD2h8>tgRHSmV35HQlX1KFSB?4CH-TNsDiL*`k(~R4hx$ z@Otgw&pEqp)wXow+m4z5_Z_dyp^CuLgo32#tj_AD%3QD8%%fn9%~OlJ;dq50&GRmE zRq@hgOP8B+mq&R&dNVqHxP5nC#O*skvDp94OiA+Gq&7=c!t_e-3)!P!jc@+ebT_!$ zH+LZK%k5hq6WN;BnxDGdxOnz&e z_j6C9;EfmWn(k$u79Pm^atoJ~F3ia-t1Bz6@b+{W1!sJzxTU+gzlF1gYKB`1+6vp7 z;!K6uTcqCs@y~dB1i8iCRY1ll{nETk9hKG;5j}rtcvNkr*M*GHFpR=oRsmW2RTA8? 
zP2{hw&q_?qu%_A~ya+zz=yxEuJB$Zk_r_Wnf?(HJ= z=UiR#^y2PPPcL~htcjnWl03aCE2qlaJ#};p>$;0Qo6iG3ItA!CXE0d3Uw;?{6D%#& zbQif>5ipY=tM@Z3qezd%soL)A;hdk+uW*2W-@v!E@;X~A(UtKX3H9DLjsKj#OD?SJ z2FEVe2i8xbm0g4LXsphQ$TT-+CR)9(GL3>UKE7PveVv++riUmd>Lw~@5C2q7Qd(PU zO^2zWG2eatv+MHx&rxK6Pl5&Ag=!)&#GQd+_7I=<)YN&+b<;a7^QL(F-~S=&E-Wy6 z91+U93o!4QmkWQH_Xu7I;q$8s(i^PHOOm{=8~tHMo_w&=LPhsNoX?!g`E(7)Xc%An z>V@XAc~Le~)dKGeLR4o?N{+R@K0YGO`$?fuWQDyU zD!cP=R&iI}Buu|@Enu#xt;yW5tTZLX`(3?JaK&yCOS*G0*Z!9dW3dZ)N=j{9_|kmu zHw8z*48Ny^P4iYLuEG**!!L#@AZk{*MdKIY&@YCmaQbtn9#&x-*8NgxL`V+o``kGh z8t%p?i;VpSE&S?8>a^+&e0%A3Rh?p`Vu>Oa zrk+(yhN2?pIG4xZ)12FokT=MX;CTQLaU-bzu_GYXBqDZXGFGu6Y;kd!@8;j`q)5oY zLC`pz3RftIRKoVvPCsB)IUC$J89nw3?EZ%}Xb6qS*Y%toqDLlU?GU0yAY%m&R&^k9 zFuCi>>$|Q8-ViaEeuX?Li^zznJ(iha$X%|df8D;pHd-Er?jd6@VH>@BWO8JTjec@u zazwa|t{$G;g)_&OQ*3nQ@N`VZ*HmnD#mHoQ*Mcq^nH+&lXC)(&yR5j^=;9ISuG=j1 z;*rT%H%4DSGC3TVZQ;mdQ%skQ&Kr^Ly4gbKj!d@VcFP)>9FEJDIWpOVZD<)ImmOd4 zw$bS$Cdc=@ZFK6$WNe|JQ$QDhO@VhgoMjOHvO|KYPUlswkP&WmVjO({hp_$b<7pba zfEc(2LcB#Z3J(hrnthrq{u91f{k?jP+E4YcYO?ZKWxC>!!p42U$+}=_E=M=_;Ab=X z79wk!*E7q1wkmqYdt;B%V}-#FhG?MSSA!wO`&#&)GjYv|mL7$A=(*h*UsZDDG5%37 z!tim|(4l>VGV76zTC)tnR@NjbwY$ zyA=rfNbmP!|D1_y!Y=PYYPcg$pB&-+&nJu`EuPJFJ+NU3FF|m=UJu94I(@9(4;YPt z2cGgZJ(RlUX*R}7tVa+2d? z3L0~3EuG#k7L0-&UT1FT?t;eeoT*mtXF*1h6t6+`-78^zGVW5B8P)3j_U|atV-;v^ z_l;0|-Z|6i{r1$KCv~k~4c#l?(0S*z)^e|>AxDuEuWfbRH^8|9JvNiNp4_SX-l?}P z_qx423MP03Z0KI@%0I~S#!oM+_pye1{qr58V1n1ly6#R-N!$+w-C@xx;5tsPit&CD t^UpcCR@9d64)=Odd{8$9giwQWR&`W#Mnz+MRaSPT*ArZ$;D#6A{|7JfJlOyM delta 13264 zcmaia33OCN_WrB)s^9CkcfamVXMvCegvi>RJ%EseY)C>92us421VR?FCqNPvx;qM@ zEJ1|JI649%Zb+g6rH)EK1r&sUIx>SxP(~SW0rzqJRj1Jb&i|Z$_xawb_tmXi^=iF! 
zt2}*cv1i*_+!>~ZF^t*GT>Wi~PDEkPlx){!N))MI(5KnGNwGY`eNkBTT#sd#x~b~} zlbS|pS62>wj_FEx?!N;~2J3v==i2W=LW*r5P4u1esy<5kN_tr?(0-N&L1+USg4JKO zCrDZY`d)&r1+s(HAA!8v&_TpNq-YO<6B&{N#@~(_S>-%*-Hzh1I+qEDNSnz9gLsnw z>>cPVH-L$RzB|wW_bUv85}eq16I)l`Qr=uymA$wszdAK@O2ZPVt=zbzwKCRWc;G~~ ztf{naVP$zUj9P)FqS0{s3KRlaw{iLi{85YX6)o)#P8>EE)i<vw)m61rmDW_X zmoHK>X&!O?N}y*3?|>IIlcwmK7%>Ki6(Jf4=YGXOu;&5}fl(__63aXT+oszFk~>$T zVI{ESkRoEL|2D|qDFnmX9fIV4ZHLaILdfbLK0@%rN30q=nAyqbYqdUYl@_f$qGZb# z<+aj3rB(>-V<(b1eQY#~8QRB^*;dlvNU|k{f$ob-4d4fDrco6NCLt%U=PJtDr zIEXjxv2it%D#~JV zrsl=9Ef)PmZ2pqsoVI04D)XxvoNxP?nDpfGoSfP5`I!Z2&eQ!&Y;{`x!t#t|MLAi{ z-u_gxrY9B0HkVIoD=2gx_A{=9(;6$LB@|>QR5Umb`kA;QH=J3*TF6&b*x)14UgjmV zgX5>8!)ydw$-cvl=B9DWxZU`7QgqeiG2ds_KLxu~2G*TR5KJeHMtnAwSA;GRw# zP4;%;_sv>HVRlV%^Rn4lF|!srUm<_{3$8@i!rg?3^j92((FPdxFm8h{O4%Ut(!)4A zf?L4u0ilO-z8;)uX6@(#dJ(OMliQ?00xEMS?VU3jhgtwbQ=;zF2 z&M^9EeV5*<=jh?u*V=2^R;_xoW`70#xlMBH`@;T|8c4iSvukidM*9-ZET128cO&UkX9}0uGO{VjkW`C#%-a?Ka1GZ zxarli_2lS7rtwi+JxlZGF)asAkEx3H&)f_{HvYrJv0y!Ga%}4R6vOxz4UrhHn!Z-J zSmL@JQk6Si5 z)A<%TYvXSofLEcrI54~TRnY0;$HNI1e=9~Kq#Y#K&F`03Z7%5@$FF4teKZqJXLmsN zNe*N{+XQ}JxL)1Q(HVb1WqD1*g0$G8oTA*g@}}y#S%yk-Ci44IST!6t45% zR@heOF3PBgNp+^elk54tEW<#_20kvJe(MIlCLgT;Qy>4e{to^{IBtFkou-|yeIN$@ z93SDYhv;+sT|Amc_I$?g;mx!~{q@lT1+PeU!SVP&>~ymbx!>j_TfXG$5H4qar4@Ub z9}2F^{Cv3YGC!AGxXj11ES6xB`nC!8qoGPR6BXl%i67;P85NTd;Z7bKpEx!)esrwcHNq9+a-qCP zE`x;)bAjZO0Tlbb~6fsH=Z%CcfQ*nPqh&5H)|dFa%!PBTQxWmjcYT zJ;F_^bGrnR+hryZxRCjr0nOtJfyiESGR*BY$3SPVIUWx7nrE{p8~Q#sPlMW1=EtGp zGjo(_Aa*-hc*Y^vEdE}L^7@JX9!wUTHt%?OAXmry9X*YtQYnvP`D4;8(g-O;62$N5 z{CWx^I>m&c!VzJg@QAQcSZR7ds1@c2lZ8a#CIRtZ@gMLn@Q?AE`3}Aje4XM>h~6lU zg+&|1p^7}r|F2(MzqmDr+_Ot=S9f@|}+m&n9sdYw!zbMDxP* z#Dw~$#G2T4=SvX0Ma(j!X2zIa7N46iqiuF`dGoZOqoi?*Xu@bOEPp^;5i@=IGS{S* zOm}8`PQKug>#1}>R$)!m?3^X;xFyCu|IjGRo>e|=*247kg=I6G_mE+K6+gh7oj(A= zHt}ZSjzAlDa$uKO5_0od_FeWQ`v&_e`!Y+|XV|CMN7;wjzp$Iwb?j=^%igBvuuIu` zwu)r;h@TPxn*irVH!IX z++*b+c;$+yz{LxqigHNb1u@czxeu8An@v*g=0YioReLrslg1ljhWN0!PP|QhP^{9* 
z#98|Dn~zGP6`@0D6c$L5kSmM>9!bk2jQoFEF;Bq?B>k#1EicN=%3nI!73(}fR#W$^_Q?h9ZA;67*83ZIVRc32(uMBIhKy-R z&K^Hg;ZBZQRy=uHa&t{-P>B{wQ`IkTo;+nzVs%+doZUmdSEYJ8YJu)&*buTZO1f(u z$^cWhbgx;y8@EZ-Qo>_fyCpM=RuRu^yAOfq1<6LnyeOT{7{m@|CZi=t;pEfuHhHO> zBS%P=r2W!rX`bYkFqrdg`^dZbwi$?3Tx8@l+ZkcY3N(tii*3Cof-Eu@f!Cu_|6(Lh zgSjp_7B;%%CrP4Po{3m|g=~nC%UEoxVuIn`bU7Ss@$yV;E}*Yxv*mDdZ@k2xwUQyv+{fAmMl^up#s`U{vW$`A0HMad^QR^>N#@X*P1&E-uk5oJqTTIik({cM`T49hC7 zD{m^TiCA3UR9o6YYVFGADcnfr7P&CwTTs7^u)Mas zB70f9JNMSyMJQ`HcEiCbvYpgbD_cw`9ZU_%Fo(sP@Vny&K=Od0bV!%h=k6Ltt5F!v}g8B+Hjf8o=>pM2P(tUyq_ zQ_!Gfw-5m%LN$rpe@Jx&{PVt*H%Ee&r z`9HdGMf}W`(!}bP*wV&Wv;!hOR-9ze$I62!#1$AMl?%&eW@b)rODijjMXPZz9d~nm zY&b~g6_xz{iDDW__wy!;8b6%9mn}e_puK3dyar8Tw#ieN@AXY0ezJ{%FpO5JL-Qg@IZ`m3mlA9 zMnip-DuX3fnTc<~VYT2MphiPqsA?gr!_+(M=KXYA%6732fyJe!!%UZ&MmstD&804i z;GPgQ^DpL$=ELT_<{jov^Gb7_d9L|ZbCP+e*=}Yb%B{|o@E7!J+Sm$q23g@&!`L|P z?~G+ha;TdyjmSIJ*J|qI5uHfcAfK8OqblMWi2xnH#DbLXCx##AA+}H z)kmp$IkQO}0uZMf+LE%?irn~^yoS~#c{aM=Y*M4OB}EDG#The`8}er4IikrIajIR$ z_$*UQr!+MZhNq~jU~h^#n*1w8MHq{2YB9P?CA(O)u)c0mO=)v8`6XSA$5>Wr->{ZR zc1UT~MIPNpiAX;Y#-ok>CZzRO-A>j`RyzX#R-P~C(kXPAe6yazjhBOE-fYu%NguP{O1;u6(r)Pi z>27$fL2YI6Tr#0i{S5_Xwv=Xd6g7EL7^B`bB{4IuxNdrOYpwHfva&@zh_JYp>4fxU z>I%r8s2k*+Wvc6+s1{85S|X(7YtND|^R>;?e%?Xunyw8*q2_m(Q1;KSFKdk`YaQix zK%qOJV47wpy@i^9(IB`oS98dtN^4qBX9ONJh8lnCsC!EvC7Zw~(w;_H&{L${3~v=_ zlKly33X^n=(r7$rB6y0mU^1v!`?rIyM=U2l4qAiuK3=rs)}oE6!Gvwpj!S45gzuKa z%;+nB6)&iS84AiHskdumSgbFjb^EswTo`Ovp?&7}~(bSWUBbUOdJGCI_>Cja8twYJVi#`rR5;rdD@L5^Wd3P+F2QCk=+`+Q;$BUHk`ua0lJC(H5IXA5XEb%J(F7sz@a={Tj@_$uu+9?!X5w1pkN*%@>}jL2GAwWu6c zZe&AA_iNfD4!!43o3-ViVpw1j`zYCBg@<1^G@iSgVN3)4(MfnZ9jV}XT@HiZJ6H); zK4A|h(i_?jJQn6NYl*2>Q)$mGGiRGeo9)6+LZ5Irgo|g}o3`I(XHj3j$p!}#q+Iy$J?$ab_#SPv&1bYzto#ymCP+@WElCO?TmPw5&2$TS z%yZNa60n@9-MmY`U-#(^`h4ng8OxRG(RvWqtYfNsKIW4Vu*-L7sR2DcNC2x+?6ERj4x(=IC z^gNiEqG#IGquONJcF|lo`!o9$7udpK!hG8t@>GsqWRupQVT`s^o2O;K_&!_ccuiHm zQ&N;@ZJlC~f0X;=qiC4iEpIh-$R4>;&X<$q2-z%Mmd;2$(&N%r^ESyNRZ96%k`w{v 
z5`8}jE!E4gpms6>b!Nl+SPmwCE!RyP_Pfc+&Px4u6bKHf)^A1dESBToH!Qm#(P^Z|%1Nb7DN(Xuh{ZCKv|21@6?7ki7U&AeXtx$c!s&I*MF6ePtWxSx>KjLB%$cY%s76ds@ijpZ|bYhGfn&T{U0x8|`64R_sJFW;snPkmRiRGjrM1iBbLvu)3#V0QJ9m;5MHV{T7m+J-Ez^+J zoYkBbH@hMuc1A-@(7U8yzNI_Te|939n;1PuyH`D{4pExqV{)XlR6Ht1n4A2&m(NeP^0GW|?M<7(@4}G!9 zBEqbvMGJ}{gV$L;V!4^jRxov1Hp13UOK}jpRvzwOT%~WNkLkkNzpP5jz`W5C9>gsZ zi^ME3p4G&`qE%$fUz$HKf3V5jlMmfqjA8vo%Ta^vW{lS9`K5J5mBp!Tr5VocPudR%$&&Y~fTo_crzQY*HD`zc_ac4{_ZLG-)N+O5u zv!vaEn!(g-SqRm=v~4Ovd9;U1N4P(%qo=rZ=Hj2LD4nw`14zbc z%Yj_~{vyH}YnEjhad+e6Fuof_kVO&JAO!i5R!rI>t$Pr95N;l9ZI-x?8L6zkehF)a zU4yL?Q8|4294aMQL#!{$9B+D>2rg?V68>R&%e0@qjdoiX;Z6Ask??f9)s5=NNAcF1 zLb=Us9|-x@M`2ICbqKtZZ>=U{3#{uAOq^p4CVkVaS%{V)y3qO~YY~2AECEAeVf~_V zw34`HSx+VT{p^i&lYxK2XYhV}KW@VLcr;eHi`-G}PHq>M0_INZQv-LJe5Ml91k-@P zUPPELj2A-rAI-xREPT2#)xPB*l8AYubs`J1c3N$4@+O0a&_}H@{O2L74(*$)5_pa% zq14x*@KAYN82~+-tS`W^Bg#$CeptB+R)f`m_|4XOXxL&6A+%SeOW2&lR8xnX^^F1I zdGV0AOuqKdzTG-nz}h{`ap-(s>4pc6 zDjIwkYNg&c7D_f-gTeKLb#EZ2U!SlFncN2M5fCn0$C32QRx={EUa@Ag)36dBpC(%C<_^$Yzz!cE<$2LTyfE=c@U9cIG}#45$BiCWLI(1s=0EO+c{_q+;NK$ z0^6uLL_QpAV6I;~iigX$7>BMZk9Wbr3C8sP7$q*gxN+5Vd2+0C(9N)Yq7m9}&yf6y z#@-Y>8-+8_*Q*YNZTaE^sPPy%U^$1JQ1O6aK%&>UiSSnEF>^EE#=P z-H7}WR?@rHh~rQx9n!|*)c)VyX$%0gTeLyqM&o9}Z!~-&{Ny!sQ&LPZTL;@#8NpIa zLQFAwMm>yH!80GwCeXf8zlQv?YAA$%u4+(xmTr%NKT)$t)`P~&9CtHcMZI?PYtnjc z!$?X5x?mzzyNuhT`ah3##gK208qZN@_(A`??Mfp1j~P=jejII+!Q5-ak+fc8I3kNr z8JDP+MS_Gcqc7$y@T1$3E*zQv3 zJTVvgMpA=!kbsgkI$FtsnzF)%nn}^zbaa?h&1)T-EjQHI`#yZ`WI~uNcviK!XAz`Z0GNi2+v-T z=LUQ^>wcGu;nFUfMiM{9WxPq6$Bbs7tx6ik8fD-`VIu>3mR zD{efG2f=_SQRIz6rHvWsCx)`LfApV*P52Kz4IBNaEW#%j@y8}AP|QN-ZfXqOvfCyK zjurkS{mf0Q;%>u;n#Jji_Y^w0qJL0YIMsMCmi z`#Ii@OjLww&)-9}|N69Fd%HhTKSQ;*1?^X6ISeZ@T9(P_)@?g#qyF&7o*hLhqmi*yFY07F$uAiJt zm(Hxk=y7PMv3~VEEv3 zp90%1*m(e6P+`Lby9vV1o0ZkiUa(U*D1s=Q6u}fB6rmJh6ay#*QiM|sqKKe~q==#z zOfiJwCW@gH!ziLDZl)McF@j3PmbK8pR}vbczg$Oo}XuY>LSgw^B@@$f3xkm`agHkxx-TF^yt6MIpru zikTF%C}vX>Q4~|mp_ofCk77Q>0*VrfQi?K)g%pb@$|)98R8Uk>R8dq@ETO2OsHLc* 
zsHbS4XrySOXr^ePSW2;sVmU=CMH@vs#cdR~Q`|wZf;@X+rC4Egyzh9QUdfTnSY%n- zBeqB2c%COd1Rp}P@e7RpU;S(Sl73$QK>vq+Qa`F6B%vvuXgUGEK(Y%wOA+@DK1N=e z=HXetvmlHdPWOBy!J`X2gE5ck!h3vyC*CEnzo9DjJBCO-z6J!NpYdz};XcnhWw&ts zfBlF5{izoJFHoEZY!C4NW+t+>Ew(KIX0vTGWk#{Kjkb*erqk9*nK0J2-nO0wMAo*} zwl)xTFWt7?vdQxl7aYbKKN>$$y&`M;VEoV@ZG3NhzaKnE6!3Yph!zjD9v>lH+dNh_ zSc$v-&wdy9lM6dMzo24MC^Lk0cpM(3nr7`-=~zhv2J2WsC#GteiQ^8(9sQX&jyjG8 z0!JK2XcSG=?r1l~`(@f3ZGos(M{6L}GRHD9;Ze^>!BDnQb@oDgAq|MEeY$-*+`ij0 z#L)lk=X343`VW3S$DTu;-R-%=V!tNX-sKqp$|g_rVEU=FN@#QT=V04z+ur~24q6IH z-$k!W(bgE*)^LSIRt565VgN=@tHN0bf4rNn7dcb@uja zbDngb3_-k0)aQ3Zw3NyINt~ajyeNX;ymI!5?~HH4+jFT zJ6{h3UUR+{2)yciH4tzdavlnV4muA80zJ+i^4fErfk-*vJP@E>alQgy?Db@D^PKZY z{60^+h=aHw4xaDwWW%=I9;fQ~%<);jtm9M1r_`qP42O$5J)wh~UCyq6j<=m}2Lj#B z?m*zQ^ECJlc!tnxZ?yWIyPUiF1CG;<(*c1~)OB_6faiCe`c@8}^w`l-@{g0A5fOrv z%sj^JVnFKl4JIQldq$w)+-7QtH*;nG|39oa=Sd*@uXtWUgKx87b0k~Z($Z8#&o|1O z9ql&~ZIJwvCxYzz#&aj1F3q7w;{TJjdv1`nuaq99r|8#6tf0hoN$!9O-#p_-!nXA0NmSVxKL3S^53qf7u}fSLiywCHI-4~ zI!&d}^E1s+8G60k977NWOcMStzJTAuhw(nV6K}#k+=xqX9-e?l;Q?6Y ze&YVao#Bpf`@!7p8?ZUb`z)4jm6A7K^hWVIqW-VlU?A^Sv*{tzdecf%1DrtK20ChQ zs6FS&8;KJ)5^vo|9KVq`b|dlTjl>%PV&$J5)_L?sks~(}hi@cazma(DM&i{Qi9^>D z_Gah7>zQU}59D*+#n?~7i=4NSC|`Rl2*#Pb@o=BRJC;0Y@@~M68uV1++VxZ>HA5i#q5ndI3!6_BzR^2p`g6^hDoaC>Y}lg9F_*>Y9z9 zqQk(w-y13>yNl^n7REdj&MokTl6_-*IW9q)%Cw^}1}yzIW=ec#S^eC`)Pax^++l|Q ztn>vz-%4Ku6qWks`L%?>;FZ2XaJkf%7I@7q^OZtDnJ*lUm-*&{oA(VQSqps;xIa%= zy~1aKk(E9TQogoe@<@g6Dc*0x41&m0s)LV@cNMdr(!&pL2w7F@~$JVtrOVCWe02AUJo@8v^y~d~;w=qi>i$iwSV* zF2ljQ5@>udoYQ?ybNpymY!9G5gMz~t-HrJfR}g5S#-bE=kbk$3BesYgopZ6Q(ST}mjmv@R9%7B_lUov zgo5VtP4b6A;PVjL7_o=F!EleyR|eM5j!;lSJL+M=VXtJSZd6xNF}I>Weg9ibF?R=` z{k8ngTHlgCC1Mi%CHHsiE$4k0o)7N$&0kW7zZ}W2#awkD3(N2REGqwe!8op% ztN63TiS+vy{qK#&(M>I_4ZoHI8dv(iHyTH~nZKqce(cvtyX95Ec-pkH0#Rc=@kdcl zwO{Vx{+Gf(<>Fk$T*04jvGfD-|Mx~aOYZ(oA5K9hsdv-&9V>LgPrbeY7J98`nNTIn z74n2MAzp|E*D2phaG&)B=^7P zt2IwCy-05}J;A6SnMzFzUV}rqC-jH)Xzc|pn;wFD6+?bT9xJ^eRf<vM6BMxuSIPUb&lBOSGe$Ljh2;GMUGmv^2b-7~adl~`W-+Y64+9T__ 
z(&5T%V-V0o)PYKEQ&m#yjK!0eRHsc3h|m^l=$h28ie@7Q&Sm>}!>(L=-QcRMo2p9- zsOvw!T_0mHL++RkNol{v*WWKp9gM{-U8yn?IUP!Sts_8-vWY>}iAlK?jcHW{)2_Yp zAJ9(c+s3XGD0oY!>%i4BX)~O>?30vh@0bOI-SNdWT@wI3ri8)Ehx9?pwRh(GMFdw& zu@Z@Upx||SkDd16$sAkP1sm5;Z@2z5y;t7*wJt)~CEs&gb5}H!RMEQpfA=2~?LRbS zK{=h^Enm>4=)PjJDY@6&eD2TCL?=t_35Yx_^=IIKCoj^5p5HajKZ=~>z$d=n5nVsf zEGSKvLZRnVdWU@5rR!}Pfb;h`Mrkw6k5XR|0^NU J)s8Fd{{gPW;z|Gj From 301f23cd2d2f73e63c388e13758e438a804583aa Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:38:06 +0300 Subject: [PATCH 31/33] test(server): clean up search scripts --- apps/server/spec/search_profiling.spec.ts | 306 ------------------ .../search/services/search_benchmark.spec.ts | 4 +- 2 files changed, 3 insertions(+), 307 deletions(-) delete mode 100644 apps/server/spec/search_profiling.spec.ts diff --git a/apps/server/spec/search_profiling.spec.ts b/apps/server/spec/search_profiling.spec.ts deleted file mode 100644 index 8099a322b4..0000000000 --- a/apps/server/spec/search_profiling.spec.ts +++ /dev/null @@ -1,306 +0,0 @@ -/** - * Integration-level search profiling test. - * - * Uses the real SQLite database (spec/db/document.db loaded in-memory), - * real sql module, real becca cache, and the full app stack. - * - * Profiles search at large scale (50K+ notes) to match real-world - * performance reports from users with 240K+ notes. 
- */ -import { Application } from "express"; -import { beforeAll, describe, expect, it } from "vitest"; -import config from "../src/services/config.js"; - -let app: Application; - -function timed(fn: () => T): [T, number] { - const start = performance.now(); - const result = fn(); - return [result, performance.now() - start]; -} - -function randomId(len = 12): string { - const chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - let id = ""; - for (let i = 0; i < len; i++) id += chars[Math.floor(Math.random() * chars.length)]; - return id; -} - -function randomWord(len = 8): string { - const chars = "abcdefghijklmnopqrstuvwxyz"; - let w = ""; - for (let i = 0; i < len; i++) w += chars[Math.floor(Math.random() * chars.length)]; - return w; -} - -function generateContent(wordCount: number, keyword?: string): string { - const paragraphs: string[] = []; - let remaining = wordCount; - let injected = false; - while (remaining > 0) { - const n = Math.min(remaining, 30 + Math.floor(Math.random() * 30)); - const words: string[] = []; - for (let i = 0; i < n; i++) words.push(randomWord(3 + Math.floor(Math.random() * 10))); - if (keyword && !injected && remaining < wordCount / 2) { - words[Math.floor(words.length / 2)] = keyword; - injected = true; - } - paragraphs.push(`

${words.join(" ")}

`); - remaining -= n; - } - return paragraphs.join("\n"); -} - -describe("Search profiling (integration)", () => { - beforeAll(async () => { - config.General.noAuthentication = true; - const buildApp = (await import("../src/app.js")).default; - app = await buildApp(); - }); - - it("large-scale profiling (50K notes)", async () => { - const sql = (await import("../src/services/sql.js")).default; - const becca = (await import("../src/becca/becca.js")).default; - const beccaLoader = (await import("../src/becca/becca_loader.js")).default; - const cls = (await import("../src/services/cls.js")).default; - const searchService = (await import("../src/services/search/services/search.js")).default; - const SearchContext = (await import("../src/services/search/search_context.js")).default; - const beccaService = (await import("../src/becca/becca_service.js")).default; - - await new Promise((resolve) => { - cls.init(() => { - const initialNoteCount = Object.keys(becca.notes).length; - console.log(`\n Initial becca notes: ${initialNoteCount}`); - - // ── Seed 50K notes with hierarchy ── - // Some folders (depth), some with common keyword "test" in title - const TOTAL_NOTES = 50000; - const FOLDER_COUNT = 500; // 500 folders - const NOTES_PER_FOLDER = (TOTAL_NOTES - FOLDER_COUNT) / FOLDER_COUNT; // ~99 notes per folder - const MATCH_FRACTION = 0.10; // 10% match "test" — ~5000 notes - const CONTENT_WORDS = 500; - - const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000"); - console.log(` Seeding ${TOTAL_NOTES} notes (${FOLDER_COUNT} folders, ~${NOTES_PER_FOLDER.toFixed(0)} per folder)...`); - - const [, seedMs] = timed(() => { - sql.transactional(() => { - const folderIds: string[] = []; - - // Create folders under root - for (let f = 0; f < FOLDER_COUNT; f++) { - const noteId = `seed${randomId(8)}`; - const branchId = `seed${randomId(8)}`; - const blobId = `seed${randomId(16)}`; - folderIds.push(noteId); - - sql.execute( - `INSERT INTO blobs (blobId, content, 
dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, - [blobId, `

Folder ${f}

`, now, now] - ); - sql.execute( - `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, - dateCreated, dateModified, utcDateCreated, utcDateModified) - VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, - [noteId, `Folder ${f} ${randomWord(5)}`, blobId, now, now, now, now] - ); - sql.execute( - `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) - VALUES (?, ?, 'root', ?, 0, 0, ?)`, - [branchId, noteId, f * 10, now] - ); - } - - // Create notes under folders - let noteIdx = 0; - for (let f = 0; f < FOLDER_COUNT; f++) { - const parentId = folderIds[f]; - for (let n = 0; n < NOTES_PER_FOLDER; n++) { - const isMatch = noteIdx < TOTAL_NOTES * MATCH_FRACTION; - const noteId = `seed${randomId(8)}`; - const branchId = `seed${randomId(8)}`; - const blobId = `seed${randomId(16)}`; - const title = isMatch - ? `Test Document ${noteIdx} ${randomWord(6)}` - : `Note ${noteIdx} ${randomWord(6)} ${randomWord(5)}`; - const content = generateContent(CONTENT_WORDS, isMatch ? 
"test" : undefined); - - sql.execute( - `INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`, - [blobId, content, now, now] - ); - sql.execute( - `INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted, - dateCreated, dateModified, utcDateCreated, utcDateModified) - VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`, - [noteId, title, blobId, now, now, now, now] - ); - sql.execute( - `INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified) - VALUES (?, ?, ?, ?, 0, 0, ?)`, - [branchId, noteId, parentId, n * 10, now] - ); - noteIdx++; - } - } - }); - }); - console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`); - - const [, reloadMs] = timed(() => beccaLoader.load()); - const totalNotes = Object.keys(becca.notes).length; - console.log(` Becca reload: ${reloadMs.toFixed(0)}ms Total notes: ${totalNotes}`); - - // ── Warm caches ── - searchService.searchNotesForAutocomplete("test", true); - - // ════════════════════════════════════════════ - // PROFILING AT SCALE - // ════════════════════════════════════════════ - - console.log(`\n ════ PROFILING (${totalNotes} notes) ════\n`); - - // 1. getCandidateNotes cost (the full-scan bottleneck) - const allNotes = Object.values(becca.notes); - const [, flatScanMs] = timed(() => { - let count = 0; - for (const note of allNotes) { - const ft = note.getFlatText(); - if (ft.includes("test")) count++; - } - return count; - }); - console.log(` getFlatText + includes scan (${allNotes.length} notes): ${flatScanMs.toFixed(1)}ms`); - - // 2. 
Full findResultsWithQuery (includes candidate scan + parent walk + scoring) - const findTimes: number[] = []; - let findResultCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })) - ); - findTimes.push(ms); - findResultCount = r.length; - } - const findAvg = findTimes.reduce((a, b) => a + b, 0) / findTimes.length; - console.log(` findResultsWithQuery (fast): avg ${findAvg.toFixed(1)}ms (${findResultCount} results)`); - - // 3. Exact-only (no fuzzy) - const exactTimes: number[] = []; - for (let i = 0; i < 3; i++) { - const [, ms] = timed(() => - searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true, enableFuzzyMatching: false })) - ); - exactTimes.push(ms); - } - const exactAvg = exactTimes.reduce((a, b) => a + b, 0) / exactTimes.length; - console.log(` findResultsWithQuery (exact): avg ${exactAvg.toFixed(1)}ms`); - console.log(` Fuzzy overhead: ${(findAvg - exactAvg).toFixed(1)}ms`); - - // 4. SearchResult construction + computeScore cost (isolated) - const results = searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true })); - console.log(` Total results before trim: ${results.length}`); - - const [, scoreAllMs] = timed(() => { - for (const r of results) r.computeScore("test", ["test"], true); - }); - console.log(` computeScore × ${results.length}: ${scoreAllMs.toFixed(1)}ms (${(scoreAllMs / results.length).toFixed(3)}ms/result)`); - - // 5. getNoteTitleForPath for all results - const [, pathTitleMs] = timed(() => { - for (const r of results) beccaService.getNoteTitleForPath(r.notePathArray); - }); - console.log(` getNoteTitleForPath × ${results.length}: ${pathTitleMs.toFixed(1)}ms`); - - // 6. 
Content snippet extraction (only 200) - const trimmed = results.slice(0, 200); - const [, snippetMs] = timed(() => { - for (const r of trimmed) { - r.contentSnippet = searchService.extractContentSnippet(r.noteId, ["test"]); - } - }); - console.log(` extractContentSnippet × 200: ${snippetMs.toFixed(1)}ms`); - - // 7. Highlighting (only 200) - const [, hlMs] = timed(() => { - searchService.highlightSearchResults(trimmed, ["test"]); - }); - console.log(` highlightSearchResults × 200: ${hlMs.toFixed(1)}ms`); - - // 7b. getBestNotePath cost (used by fast path) - const sampleNotes = Object.values(becca.notes).filter(n => n.title.startsWith("Test Document")).slice(0, 1000); - const [, bestPathMs] = timed(() => { - for (const n of sampleNotes) n.getBestNotePath(); - }); - console.log(` getBestNotePath × ${sampleNotes.length}: ${bestPathMs.toFixed(1)}ms (${(bestPathMs/sampleNotes.length).toFixed(3)}ms/note)`); - - // 8. Full autocomplete end-to-end - const autoTimes: number[] = []; - let autoCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.searchNotesForAutocomplete("test", true) - ); - autoTimes.push(ms); - autoCount = r.length; - } - const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length; - const autoMin = Math.min(...autoTimes); - console.log(`\n ★ FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms min ${autoMin.toFixed(1)}ms (${autoCount} results)`); - - // 9. With a less common search term (fewer matches) - const rareTimes: number[] = []; - let rareCount = 0; - for (let i = 0; i < 3; i++) { - const [r, ms] = timed(() => - searchService.searchNotesForAutocomplete("leitfaden", true) - ); - rareTimes.push(ms); - rareCount = r.length; - } - const rareAvg = rareTimes.reduce((a, b) => a + b, 0) / rareTimes.length; - console.log(` Autocomplete "leitfaden": avg ${rareAvg.toFixed(1)}ms (${rareCount} results)`); - - // 10. 
Full search (fastSearch=false) — the 2.7s bottleneck - console.log(`\n ── Full search (fastSearch=false) ──`); - const fullTimes: number[] = []; - let fullCount = 0; - for (let i = 0; i < 2; i++) { - const [r, ms] = timed(() => - searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: false })) - ); - fullTimes.push(ms); - fullCount = r.length; - } - const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length; - console.log(` Full search (flat + SQL): avg ${fullAvg.toFixed(1)}ms (${fullCount} results)`); - - // 11. SQL content scan alone - const [scanCount, scanMs] = timed(() => { - let count = 0; - for (const row of sql.iterateRows<{ content: Buffer | string }>(` - SELECT noteId, type, mime, content, isProtected - FROM notes JOIN blobs USING (blobId) - WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap') - AND isDeleted = 0 - AND LENGTH(content) < 2097152`)) { - count++; - } - return count; - }); - console.log(` Raw SQL scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`); - - // ── Summary ── - console.log(`\n ════ SUMMARY ════`); - console.log(` Notes: ${totalNotes} | Matches: ${findResultCount} | Hierarchy depth: 3 (root → folder → note)`); - console.log(` ──────────────────────────────────`); - console.log(` Autocomplete (fast): ${autoAvg.toFixed(1)}ms`); - console.log(` findResults: ${findAvg.toFixed(1)}ms (${((findAvg/autoAvg)*100).toFixed(0)}%)`); - console.log(` snippets+highlight: ${(snippetMs + hlMs).toFixed(1)}ms (${(((snippetMs+hlMs)/autoAvg)*100).toFixed(0)}%)`); - console.log(` Full search: ${fullAvg.toFixed(1)}ms`); - - resolve(); - }); - }); - }, 600_000); -}); diff --git a/apps/server/src/services/search/services/search_benchmark.spec.ts b/apps/server/src/services/search/services/search_benchmark.spec.ts index 53319ff9cd..c3ece17fb5 100644 --- a/apps/server/src/services/search/services/search_benchmark.spec.ts +++ b/apps/server/src/services/search/services/search_benchmark.spec.ts @@ -281,7 +281,9 @@ function 
printTable(title: string, results: BenchmarkResult[]) { // ── tests ──────────────────────────────────────────────────────────── -describe("Comprehensive Search Benchmark", () => { +// Skipped by default - this is a benchmark, not a test. +// Remove .skip to run manually for performance analysis. +describe.skip("Comprehensive Search Benchmark", () => { afterEach(() => { becca.reset(); From e40504b7f06adab6f8147f072d62d559da275177 Mon Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 13:43:25 +0300 Subject: [PATCH 32/33] chore(search): address requested changes --- apps/server/src/becca/entities/battribute.ts | 5 +++++ apps/server/src/services/search/services/search.ts | 7 ++++--- apps/server/src/services/search/utils/text_utils.ts | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/apps/server/src/becca/entities/battribute.ts b/apps/server/src/becca/entities/battribute.ts index 77a15c2fd1..dbb6502113 100644 --- a/apps/server/src/becca/entities/battribute.ts +++ b/apps/server/src/becca/entities/battribute.ts @@ -202,6 +202,11 @@ class BAttribute extends AbstractBeccaEntity { this.utcDateModified = dateUtils.utcNowDateTime(); + // Recompute normalized fields in case name/value were modified directly + // (e.g., attr.value = "..." 
followed by attr.save()) + this.normalizedName = normalize(this.name); + this.normalizedValue = normalize(this.value); + super.beforeSaving(); this.becca.attributes[this.attributeId] = this; diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index ea1d20c263..0523aeb98f 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -595,10 +595,11 @@ function extractAttributeSnippet(noteId: string, searchTokens: string[], maxLeng // Look for attributes that match the search tokens for (const attr of attributes) { - const attrName = attr.name?.toLowerCase() || ""; - const attrValue = attr.value?.toLowerCase() || ""; + // Use pre-normalized fields from BAttribute for diacritic-insensitive matching + const attrName = attr.normalizedName || normalize(attr.name || ""); + const attrValue = attr.normalizedValue || normalize(attr.value || ""); const attrType = attr.type || ""; - + // Check if any search token matches the attribute name or value const hasMatch = searchTokens.some(token => { const normalizedToken = normalize(token); diff --git a/apps/server/src/services/search/utils/text_utils.ts b/apps/server/src/services/search/utils/text_utils.ts index 7528571f86..1993924555 100644 --- a/apps/server/src/services/search/utils/text_utils.ts +++ b/apps/server/src/services/search/utils/text_utils.ts @@ -282,7 +282,9 @@ export function fuzzyMatchWordWithResult(token: string, text: string, maxDistanc // Exact match check first (most common case) if (normalizedText.includes(normalizedToken)) { - return token; + // Find the exact match position and return the original substring with case preserved + const matchIndex = normalizedText.indexOf(normalizedToken); + return text.substring(matchIndex, matchIndex + normalizedToken.length); } // For fuzzy matching, split into words and check each against the token From f58dd12983cbdd1cdf47b9e72991b721271df4c9 Mon 
Sep 17 00:00:00 2001 From: Elian Doran Date: Mon, 13 Apr 2026 14:03:17 +0300 Subject: [PATCH 33/33] chore(search): use loop to prevent nested strip tags injection --- .../src/services/search/services/search.ts | 6 +- .../services/search/utils/text_utils.spec.ts | 67 ++++++++++++++++++- .../src/services/search/utils/text_utils.ts | 24 +++++++ 3 files changed, 93 insertions(+), 4 deletions(-) diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 0523aeb98f..97bfc457a4 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -8,6 +8,7 @@ import SearchContext from "../search_context.js"; import becca from "../../../becca/becca.js"; import beccaService from "../../../becca/becca_service.js"; import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js"; +import { stripHtmlTags } from "../utils/text_utils.js"; import log from "../../log.js"; import hoistedNoteService from "../../hoisted_note.js"; import type BNote from "../../../becca/entities/bnote.js"; @@ -494,10 +495,9 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength return ""; // Protected but no session available } - // Strip HTML tags for text notes — use fast regex for snippet extraction - // (striptags library is ~18x slower and not needed for search snippets) + // Strip HTML tags for text notes if (note.type === "text") { - content = content.replace(/<[^>]*>/g, ""); + content = stripHtmlTags(content); } if (!content) { diff --git a/apps/server/src/services/search/utils/text_utils.spec.ts b/apps/server/src/services/search/utils/text_utils.spec.ts index a5f1da129d..146f5cc0fe 100644 --- a/apps/server/src/services/search/utils/text_utils.spec.ts +++ b/apps/server/src/services/search/utils/text_utils.spec.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from "vitest"; -import { calculateOptimizedEditDistance, 
validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js'; +import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js'; describe('Fuzzy Search Core', () => { describe('calculateOptimizedEditDistance', () => { @@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => { expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens }); }); + + describe('stripHtmlTags', () => { + it('strips simple HTML tags', () => { + expect(stripHtmlTags('

Hello

')).toBe('Hello'); + expect(stripHtmlTags('
World
')).toBe('World'); + expect(stripHtmlTags('Bold and italic')).toBe('Bold and italic'); + }); + + it('handles self-closing tags', () => { + expect(stripHtmlTags('Line1
Line2')).toBe('Line1Line2'); + expect(stripHtmlTags('Image: ')).toBe('Image: '); + }); + + it('handles tags with attributes', () => { + expect(stripHtmlTags('Link')).toBe('Link'); + expect(stripHtmlTags('
Content
')).toBe('Content'); + }); + + it('handles nested tag patterns securely', () => { + // Security property: no complete patterns remain after stripping + // Residual `>` chars are harmless for XSS + + // Nested tags: inner tag removed, then outer tag removed + // c> → → '' (but leaves residual `c>`) + const result1 = stripHtmlTags('c>text'); + expect(result1).not.toMatch(/<[a-z]/i); // No opening tags remain + expect(result1).toBe('c>text'); // Residual text is safe + + // Complex nesting leaves no exploitable patterns + const result2 = stripHtmlTags('ipt>alert(1)'); + expect(result2).not.toMatch(/