diff --git a/CLAUDE.md b/CLAUDE.md index 1b90b02881..a395f985bf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -162,8 +162,9 @@ Trilium provides powerful user scripting capabilities: - To add a new user preference: 1. Add the option type to `OptionDefinitions` in `packages/commons/src/lib/options_interface.ts` 2. Add a default value in `apps/server/src/services/options_init.ts` in the `defaultOptions` array - 3. **Whitelist the option** in `apps/server/src/routes/api/options.ts` by adding it to `ALLOWED_OPTIONS` (required for client updates) - 4. Use `useTriliumOption("optionName")` hook in React components to read/write the option + 3. **Whitelist the option** in `apps/server/src/routes/api/options.ts` by adding it to the `ALLOWED_OPTIONS` set — **without this, the API will reject changes with "Option 'X' is not allowed to be changed"** + 4. If the option should be user-editable in the UI, add a control in the appropriate settings component (e.g., `apps/client/src/widgets/type_widgets/options/other.tsx`) and a translation key in `apps/client/src/translations/en/translation.json` + 5. Use the `useTriliumOption("optionName")` hook in React components to read/write the option - Available hooks: `useTriliumOption` (string), `useTriliumOptionBool`, `useTriliumOptionInt`, `useTriliumOptionJson` - See `docs/Developer Guide/Developer Guide/Concepts/Options/Creating a new option.md` for detailed documentation diff --git a/apps/client/src/translations/en/translation.json b/apps/client/src/translations/en/translation.json index d5607a95e7..25e6940e9a 100644 --- a/apps/client/src/translations/en/translation.json +++ b/apps/client/src/translations/en/translation.json @@ -1324,6 +1324,13 @@ "erase_excess_revision_snapshots": "Erase excess revision snapshots now", "erase_excess_revision_snapshots_prompt": "Excess revision snapshots have been erased."
}, + "search": { + "title": "Search", + "fuzzy_matching_label": "Typo tolerance in search", + "fuzzy_matching_description": "Affects quick search and full search. Finds similar words when exact matches are insufficient.", + "autocomplete_fuzzy_label": "Typo tolerance in autocomplete", + "autocomplete_fuzzy_description": "Affects jump-to-note and note selectors. Slower but tolerates typos." + }, "search_engine": { "title": "Search Engine", "custom_search_engine_info": "Custom search engine requires both a name and a URL to be set. If either of these is not set, DuckDuckGo will be used as the default search engine.", diff --git a/apps/client/src/widgets/type_widgets/options/other.tsx b/apps/client/src/widgets/type_widgets/options/other.tsx index e6813f8d2b..b1ccfcfcc0 100644 --- a/apps/client/src/widgets/type_widgets/options/other.tsx +++ b/apps/client/src/widgets/type_widgets/options/other.tsx @@ -14,13 +14,16 @@ import FormGroup from "../../react/FormGroup"; import FormSelect from "../../react/FormSelect"; import FormText from "../../react/FormText"; import FormTextBox, { FormTextBoxWithUnit } from "../../react/FormTextBox"; +import FormToggle from "../../react/FormToggle"; import { useTriliumOption, useTriliumOptionBool, useTriliumOptionJson } from "../../react/hooks"; +import OptionsRow from "./components/OptionsRow"; import OptionsSection from "./components/OptionsSection"; import TimeSelector from "./components/TimeSelector"; export default function OtherSettings() { return ( <> + {isElectron() && <> @@ -36,6 +39,39 @@ export default function OtherSettings() { ); } +function SearchSettings() { + const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching"); + const [ autocompleteFuzzy, setAutocompleteFuzzy ] = useTriliumOptionBool("searchAutocompleteFuzzy"); + + return ( + + + + + + + + + + ); +} + function SearchEngineSettings() { const [ customSearchEngineName, setCustomSearchEngineName ] = 
useTriliumOption("customSearchEngineName"); const [ customSearchEngineUrl, setCustomSearchEngineUrl ] = useTriliumOption("customSearchEngineUrl"); diff --git a/apps/server/spec/db/document.db b/apps/server/spec/db/document.db index 371c3a1329..f5cf761826 100644 Binary files a/apps/server/spec/db/document.db and b/apps/server/spec/db/document.db differ diff --git a/apps/server/src/becca/becca-interface.ts b/apps/server/src/becca/becca-interface.ts index 2ffba1467c..f1e7c7fde3 100644 --- a/apps/server/src/becca/becca-interface.ts +++ b/apps/server/src/becca/becca-interface.ts @@ -1,4 +1,6 @@ import sql from "../services/sql.js"; +import log from "../services/log.js"; +import { formatSize } from "../services/utils.js"; import NoteSet from "../services/search/note_set.js"; import NotFoundError from "../errors/not_found_error.js"; import type BOption from "./entities/boption.js"; @@ -31,9 +33,22 @@ export default class Becca { allNoteSetCache: NoteSet | null; + /** + * Pre-built parallel arrays for fast flat text scanning in search. + * Avoids per-note property access overhead when iterating 50K+ notes. + * Supports incremental updates: when individual notes change, only their + * entries are rebuilt rather than the entire index. + */ + flatTextIndex: { notes: BNote[], flatTexts: string[], noteIdToIdx: Map } | null; + + /** NoteIds whose flat text needs to be recomputed in the index. */ + dirtyFlatTextNoteIds: Set; + constructor() { - this.reset(); + this.dirtyFlatTextNoteIds = new Set(); this.allNoteSetCache = null; + this.flatTextIndex = null; + this.reset(); } reset() { @@ -242,6 +257,67 @@ export default class Becca { /** Should be called when the set of all non-skeleton notes changes (added/removed) */ dirtyNoteSetCache() { this.allNoteSetCache = null; + // Full rebuild needed since the note set itself changed + this.flatTextIndex = null; + this.dirtyFlatTextNoteIds.clear(); + } + + /** Mark a single note's flat text as needing recomputation in the index. 
*/ + dirtyNoteFlatText(noteId: string) { + if (this.flatTextIndex) { + // Index exists — schedule an incremental update + this.dirtyFlatTextNoteIds.add(noteId); + } + // If flatTextIndex is null, full rebuild will happen on next access anyway + } + + /** + * Returns pre-built parallel arrays of notes and their flat texts for fast scanning. + * The flat texts are already normalized (lowercase, diacritics removed). + * Supports incremental updates: when individual notes are dirtied, only their + * entries are recomputed rather than rebuilding the entire index. + */ + getFlatTextIndex(): { notes: BNote[], flatTexts: string[], noteIdToIdx: Map } { + if (!this.flatTextIndex) { + // Measure heap before building + const heapBefore = process.memoryUsage().heapUsed; + + const allNoteSet = this.getAllNoteSet(); + const notes: BNote[] = []; + const flatTexts: string[] = []; + const noteIdToIdx = new Map(); + + for (const note of allNoteSet.notes) { + noteIdToIdx.set(note.noteId, notes.length); + notes.push(note); + flatTexts.push(note.getFlatText()); + } + + this.flatTextIndex = { notes, flatTexts, noteIdToIdx }; + this.dirtyFlatTextNoteIds.clear(); + + // Measure heap after building and log + const heapAfter = process.memoryUsage().heapUsed; + const heapDelta = heapAfter - heapBefore; + log.info(`Flat text search index built: ${notes.length} notes, ${formatSize(heapDelta)}`); + } else if (this.dirtyFlatTextNoteIds.size > 0) { + // Incremental update: only recompute flat texts for dirtied notes + const { flatTexts, noteIdToIdx } = this.flatTextIndex; + + for (const noteId of this.dirtyFlatTextNoteIds) { + const idx = noteIdToIdx.get(noteId); + if (idx !== undefined) { + const note = this.notes[noteId]; + if (note) { + flatTexts[idx] = note.getFlatText(); + } + } + } + + this.dirtyFlatTextNoteIds.clear(); + } + + return this.flatTextIndex; } getAllNoteSet() { diff --git a/apps/server/src/becca/entities/battribute.ts b/apps/server/src/becca/entities/battribute.ts index 
6ff1246fcf..dbb6502113 100644 --- a/apps/server/src/becca/entities/battribute.ts +++ b/apps/server/src/becca/entities/battribute.ts @@ -6,6 +6,7 @@ import dateUtils from "../../services/date_utils.js"; import promotedAttributeDefinitionParser from "../../services/promoted_attribute_definition_parser.js"; import sanitizeAttributeName from "../../services/sanitize_attribute_name.js"; import type { AttributeRow, AttributeType } from "@triliumnext/commons"; +import { normalize } from "../../services/utils.js"; interface SavingOpts { skipValidation?: boolean; @@ -34,6 +35,11 @@ class BAttribute extends AbstractBeccaEntity { value!: string; isInheritable!: boolean; + /** Pre-normalized (lowercase, diacritics removed) name for search. */ + normalizedName!: string; + /** Pre-normalized (lowercase, diacritics removed) value for search. */ + normalizedValue!: string; + constructor(row?: AttributeRow) { super(); @@ -59,6 +65,10 @@ class BAttribute extends AbstractBeccaEntity { this.isInheritable = !!isInheritable; this.utcDateModified = utcDateModified; + // Pre-compute normalized forms for search (avoids repeated normalize() calls in hot loops) + this.normalizedName = normalize(this.name); + this.normalizedValue = normalize(this.value); + return this; } @@ -192,6 +202,11 @@ class BAttribute extends AbstractBeccaEntity { this.utcDateModified = dateUtils.utcNowDateTime(); + // Recompute normalized fields in case name/value were modified directly + // (e.g., attr.value = "..." 
followed by attr.save()) + this.normalizedName = normalize(this.name); + this.normalizedValue = normalize(this.value); + super.beforeSaving(); this.becca.attributes[this.attributeId] = this; diff --git a/apps/server/src/becca/entities/bnote.ts b/apps/server/src/becca/entities/bnote.ts index fe0cca706b..06a9fd41dc 100644 --- a/apps/server/src/becca/entities/bnote.ts +++ b/apps/server/src/becca/entities/bnote.ts @@ -802,6 +802,9 @@ class BNote extends AbstractBeccaEntity { this.__attributeCache = null; this.__inheritableAttributeCache = null; this.__ancestorCache = null; + + // Mark only this note's flat text as dirty for incremental index update + this.becca.dirtyNoteFlatText(this.noteId); } invalidateSubTree(path: string[] = []) { diff --git a/apps/server/src/routes/api/options.ts b/apps/server/src/routes/api/options.ts index e39cf7dc7f..9be9ba0670 100644 --- a/apps/server/src/routes/api/options.ts +++ b/apps/server/src/routes/api/options.ts @@ -99,6 +99,8 @@ const ALLOWED_OPTIONS = new Set([ "layoutOrientation", "backgroundEffects", "allowedHtmlTags", + "searchEnableFuzzyMatching", + "searchAutocompleteFuzzy", "redirectBareDomain", "showLoginInShareTheme", "splitEditorOrientation", diff --git a/apps/server/src/services/options_init.ts b/apps/server/src/services/options_init.ts index e15e26ad00..4bff15d91e 100644 --- a/apps/server/src/services/options_init.ts +++ b/apps/server/src/services/options_init.ts @@ -234,6 +234,10 @@ const defaultOptions: DefaultOption[] = [ isSynced: true }, + // Search settings + { name: "searchEnableFuzzyMatching", value: "true", isSynced: true }, + { name: "searchAutocompleteFuzzy", value: "false", isSynced: true }, + // Share settings { name: "redirectBareDomain", value: "false", isSynced: true }, { name: "showLoginInShareTheme", value: "false", isSynced: true }, diff --git a/apps/server/src/services/search/expressions/note_flat_text.ts b/apps/server/src/services/search/expressions/note_flat_text.ts index b9ad19c36c..b12413738e 100644 
--- a/apps/server/src/services/search/expressions/note_flat_text.ts +++ b/apps/server/src/services/search/expressions/note_flat_text.ts @@ -7,7 +7,7 @@ import Expression from "./expression.js"; import NoteSet from "../note_set.js"; import becca from "../../../becca/becca.js"; import { normalize } from "../../utils.js"; -import { normalizeSearchText, fuzzyMatchWord, fuzzyMatchWordWithResult } from "../utils/text_utils.js"; +import { normalizeSearchText, fuzzyMatchWordWithResult } from "../utils/text_utils.js"; import beccaService from "../../../becca/becca_service.js"; class NoteFlatTextExp extends Expression { @@ -23,6 +23,18 @@ class NoteFlatTextExp extends Expression { execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) { const resultNoteSet = new NoteSet(); + // Cache normalized titles to avoid redundant normalize+getNoteTitle calls + const titleCache = new Map(); + const getNormalizedTitle = (noteId: string, parentNoteId: string): string => { + const key = `${noteId}-${parentNoteId}`; + let cached = titleCache.get(key); + if (cached === undefined) { + cached = normalizeSearchText(beccaService.getNoteTitle(noteId, parentNoteId)); + titleCache.set(key, cached); + } + return cached; + }; + /** * @param note * @param remainingTokens - tokens still needed to be found in the path towards root @@ -38,10 +50,8 @@ class NoteFlatTextExp extends Expression { const noteId = resultPath[resultPath.length - 1]; if (!resultNoteSet.hasNoteId(noteId)) { - // we could get here from multiple paths, the first one wins because the paths - // are sorted by importance + // We could get here from multiple paths; the first one wins because the paths are sorted by importance executionContext.noteIdToNotePath[noteId] = resultPath; - resultNoteSet.add(becca.notes[noteId]); } } @@ -50,45 +60,40 @@ class NoteFlatTextExp extends Expression { } if (note.parents.length === 0 || note.noteId === "root") { - // we've reached root, but there are still remaining tokens -> this candidate note produced no result return; } const
foundAttrTokens: string[] = []; for (const token of remainingTokens) { - // Add defensive checks for undefined properties - const typeMatches = note.type && note.type.includes(token); - const mimeMatches = note.mime && note.mime.includes(token); - - if (typeMatches || mimeMatches) { + if ((note.type && note.type.includes(token)) || + (note.mime && note.mime.includes(token))) { foundAttrTokens.push(token); } } for (const attribute of note.getOwnedAttributes()) { - const normalizedName = normalizeSearchText(attribute.name); - const normalizedValue = normalizeSearchText(attribute.value); - for (const token of remainingTokens) { - if (normalizedName.includes(token) || normalizedValue.includes(token)) { + if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) { foundAttrTokens.push(token); } } } for (const parentNote of note.parents) { - const title = normalizeSearchText(beccaService.getNoteTitle(note.noteId, parentNote.noteId)); - const foundTokens: string[] = foundAttrTokens.slice(); + const title = getNormalizedTitle(note.noteId, parentNote.noteId); + + // Use Set for O(1) lookup instead of Array.includes() which is O(n) + const foundTokenSet = new Set(foundAttrTokens); for (const token of remainingTokens) { if (this.smartMatch(title, token, searchContext)) { - foundTokens.push(token); + foundTokenSet.add(token); } } - if (foundTokens.length > 0) { - const newRemainingTokens = remainingTokens.filter((token) => !foundTokens.includes(token)); + if (foundTokenSet.size > 0) { + const newRemainingTokens = remainingTokens.filter((token) => !foundTokenSet.has(token)); searchPathTowardsRoot(parentNote, newRemainingTokens, [note.noteId, ...takenPath]); } else { @@ -99,6 +104,22 @@ class NoteFlatTextExp extends Expression { const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext); + // Fast path for single-token autocomplete searches: + // Skip the expensive recursive parent walk and just use getBestNotePath(). 
+ // The flat text already matched, so we know the token is present. + if (this.tokens.length === 1 && searchContext.autocomplete) { + for (const note of candidateNotes) { + if (!resultNoteSet.hasNoteId(note.noteId)) { + const notePath = note.getBestNotePath(); + if (notePath) { + executionContext.noteIdToNotePath[note.noteId] = notePath; + resultNoteSet.add(note); + } + } + } + return resultNoteSet; + } + for (const note of candidateNotes) { // autocomplete should be able to find notes by their noteIds as well (only leafs) if (this.tokens.length === 1 && note.noteId.toLowerCase() === this.tokens[0]) { @@ -112,13 +133,13 @@ class NoteFlatTextExp extends Expression { // Add defensive checks for undefined properties const typeMatches = note.type && note.type.includes(token); const mimeMatches = note.mime && note.mime.includes(token); - + if (typeMatches || mimeMatches) { foundAttrTokens.push(token); } for (const attribute of note.ownedAttributes) { - if (normalizeSearchText(attribute.name).includes(token) || normalizeSearchText(attribute.value).includes(token)) { + if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) { foundAttrTokens.push(token); } } @@ -165,10 +186,25 @@ class NoteFlatTextExp extends Expression { getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] { const candidateNotes: BNote[] = []; - for (const note of noteSet.notes) { - const normalizedFlatText = normalizeSearchText(note.getFlatText()); + // Use the pre-built flat text index for fast scanning. + // This provides pre-computed flat texts in a parallel array, avoiding + // per-note property access overhead at large scale (50K+ notes). 
+ const { notes: indexNotes, flatTexts } = becca.getFlatTextIndex(); + + // Treat the input as the full note set when its size equals the index size (a size-only check), so membership tests can be skipped + const isFullSet = noteSet.notes.length === indexNotes.length; + + for (let i = 0; i < indexNotes.length; i++) { + const note = indexNotes[i]; + + // Skip notes not in the input set (only check when not using the full set) + if (!isFullSet && !noteSet.hasNoteId(note.noteId)) { + continue; + } + + const flatText = flatTexts[i]; for (const token of this.tokens) { - if (this.smartMatch(normalizedFlatText, token, searchContext)) { + if (this.smartMatch(flatText, token, searchContext)) { candidateNotes.push(note); break; } diff --git a/apps/server/src/services/search/search_context.ts b/apps/server/src/services/search/search_context.ts index 314c7e7ce6..79b0b6db3d 100644 --- a/apps/server/src/services/search/search_context.ts +++ b/apps/server/src/services/search/search_context.ts @@ -1,6 +1,7 @@ "use strict"; import hoistedNoteService from "../hoisted_note.js"; +import optionService from "../options.js"; import type { SearchParams } from "./services/types.js"; class SearchContext { @@ -18,6 +19,8 @@ class SearchContext { debug?: boolean; debugInfo: {} | null; fuzzyAttributeSearch: boolean; + /** When true, skip the two-phase fuzzy fallback and use the single-token fast path.
*/ + autocomplete: boolean; enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase highlightedTokens: string[]; originalQuery: string; @@ -46,7 +49,12 @@ class SearchContext { this.debug = params.debug; this.debugInfo = null; this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch; - this.enableFuzzyMatching = true; // Default to true for backward compatibility + this.autocomplete = !!params.autocomplete; + try { + this.enableFuzzyMatching = optionService.getOptionBool("searchEnableFuzzyMatching"); + } catch { + this.enableFuzzyMatching = true; // Default to true if option not yet initialized + } this.highlightedTokens = []; this.originalQuery = ""; this.fulltextQuery = ""; // complete fulltext part diff --git a/apps/server/src/services/search/search_result.ts b/apps/server/src/services/search/search_result.ts index 10e4b33ca6..b58eb42ae8 100644 --- a/apps/server/src/services/search/search_result.ts +++ b/apps/server/src/services/search/search_result.ts @@ -56,8 +56,9 @@ class SearchResult { this.fuzzyScore = 0; // Reset fuzzy score tracking const note = becca.notes[this.noteId]; - const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase()); - const normalizedTitle = normalizeSearchText(note.title.toLowerCase()); + // normalizeSearchText already lowercases — no need for .toLowerCase() first + const normalizedQuery = normalizeSearchText(fulltextQuery); + const normalizedTitle = normalizeSearchText(note.title); // Note ID exact match, much higher score if (note.noteId.toLowerCase() === fulltextQuery) { @@ -88,30 +89,32 @@ class SearchResult { } addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) { - const normalizedStr = normalizeSearchText(str.toLowerCase()); + // normalizeSearchText already lowercases — no need for .toLowerCase() first + const normalizedStr = normalizeSearchText(str); const chunks = normalizedStr.split(" "); + // Pre-normalize tokens once 
instead of per-chunk + const normalizedTokens = tokens.map(t => normalizeSearchText(t)); + let tokenScore = 0; for (const chunk of chunks) { - for (const token of tokens) { - const normalizedToken = normalizeSearchText(token.toLowerCase()); + for (let ti = 0; ti < normalizedTokens.length; ti++) { + const normalizedToken = normalizedTokens[ti]; if (chunk === normalizedToken) { - tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor; + tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * tokens[ti].length * factor; } else if (chunk.startsWith(normalizedToken)) { - tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor; + tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * tokens[ti].length * factor; } else if (chunk.includes(normalizedToken)) { - tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor; - } else { - // Try fuzzy matching for individual tokens with caps applied + tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * tokens[ti].length * factor; + } else if (enableFuzzyMatching && + normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH && + this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) { + // Only compute edit distance when fuzzy matching is enabled const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE); - if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE && - normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH && - this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) { - + if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE) { const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE); - // Apply caps: limit token length multiplier and per-token contribution - const cappedTokenLength = Math.min(token.length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER); + const cappedTokenLength = Math.min(tokens[ti].length, 
SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER); const fuzzyTokenScore = Math.min( fuzzyWeight * cappedTokenLength * factor, SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN diff --git a/apps/server/src/services/search/services/search.ts b/apps/server/src/services/search/services/search.ts index 09b003a2e3..97bfc457a4 100644 --- a/apps/server/src/services/search/services/search.ts +++ b/apps/server/src/services/search/services/search.ts @@ -1,6 +1,5 @@ "use strict"; -import normalizeString from "normalize-strings"; import lex from "./lex.js"; import handleParens from "./handle_parens.js"; import parse from "./parse.js"; @@ -8,7 +7,8 @@ import SearchResult from "../search_result.js"; import SearchContext from "../search_context.js"; import becca from "../../../becca/becca.js"; import beccaService from "../../../becca/becca_service.js"; -import { normalize, escapeHtml, escapeRegExp } from "../../utils.js"; +import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js"; +import { stripHtmlTags } from "../utils/text_utils.js"; import log from "../../log.js"; import hoistedNoteService from "../../hoisted_note.js"; import type BNote from "../../../becca/entities/bnote.js"; @@ -17,8 +17,8 @@ import type { SearchParams, TokenStructure } from "./types.js"; import type Expression from "../expressions/expression.js"; import sql from "../../sql.js"; import scriptService from "../../script.js"; -import striptags from "striptags"; import protectedSessionService from "../../protected_session.js"; +import optionService from "../../options.js"; export interface SearchNoteResult { searchResultNoteIds: string[]; @@ -252,21 +252,21 @@ function findResultsWithExpression(expression: Expression, searchContext: Search // Phase 1: Try exact matches first (without fuzzy matching) const exactResults = performSearch(expression, searchContext, false); - + // Check if we have sufficient high-quality results const minResultThreshold = 5; const minScoreForQuality = 10; // Minimum 
score to consider a result "high quality" - + const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality); - + // If we have enough high-quality exact matches, return them if (highQualityResults.length >= minResultThreshold) { return exactResults; } - + // Phase 2: Add fuzzy matching as fallback when exact matches are insufficient const fuzzyResults = performSearch(expression, searchContext, true); - + // Merge results, ensuring exact matches always rank higher than fuzzy matches return mergeExactAndFuzzyResults(exactResults, fuzzyResults); } @@ -410,6 +410,12 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear query = query || ""; searchContext.originalQuery = query; + // For autocomplete searches, use the dedicated autocomplete fuzzy option + // instead of the global fuzzy setting. Do this early so it applies to all code paths. + if (searchContext.autocomplete) { + searchContext.enableFuzzyMatching = optionService.getOptionBool("searchAutocompleteFuzzy"); + } + const expression = parseQueryToExpression(query, searchContext); if (!expression) { @@ -491,75 +497,63 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength // Strip HTML tags for text notes if (note.type === "text") { - content = striptags(content); + content = stripHtmlTags(content); } - // Normalize whitespace while preserving paragraph breaks - // First, normalize multiple newlines to double newlines (paragraph breaks) - content = content.replace(/\n\s*\n/g, "\n\n"); - // Then normalize spaces within lines - content = content.split('\n').map(line => line.replace(/\s+/g, " ").trim()).join('\n'); - // Finally trim the whole content - content = content.trim(); - if (!content) { return ""; } - // Try to find a snippet around the first matching token - const normalizedContent = normalizeString(content.toLowerCase()); + // Find match position using normalize on the raw stripped content. 
+ // We use a single normalize() pass — no need for expensive whitespace + // normalization just to find the match index. + const normalizedContent = normalize(content); + const normalizedTokens = searchTokens.map(token => normalize(token)); let snippetStart = 0; - let matchFound = false; - for (const token of searchTokens) { - const normalizedToken = normalizeString(token.toLowerCase()); + for (const normalizedToken of normalizedTokens) { const matchIndex = normalizedContent.indexOf(normalizedToken); - + if (matchIndex !== -1) { // Center the snippet around the match snippetStart = Math.max(0, matchIndex - maxLength / 2); - matchFound = true; break; } } - // Extract snippet - let snippet = content.substring(snippetStart, snippetStart + maxLength); + // Extract a snippet region from the raw content, then clean only that + const snippetRegion = content.substring(snippetStart, snippetStart + maxLength + 100); - // If snippet contains linebreaks, limit to max 4 lines and override character limit + // Normalize whitespace only on the small snippet region + let snippet = snippetRegion + .replace(/\n\s*\n/g, "\n\n") + .replace(/[ \t]+/g, " ") + .trim() + .substring(0, maxLength); + + // If snippet contains linebreaks, limit to max 4 lines const lines = snippet.split('\n'); if (lines.length > 4) { - // Find which lines contain the search tokens to ensure they're included - const normalizedLines = lines.map(line => normalizeString(line.toLowerCase())); - const normalizedTokens = searchTokens.map(token => normalizeString(token.toLowerCase())); - // Find the first line that contains a search token let firstMatchLine = -1; - for (let i = 0; i < normalizedLines.length; i++) { - if (normalizedTokens.some(token => normalizedLines[i].includes(token))) { + for (let i = 0; i < lines.length; i++) { + const normalizedLine = normalize(lines[i]); + if (normalizedTokens.some(token => normalizedLine.includes(token))) { firstMatchLine = i; break; } } if (firstMatchLine !== -1) { - // 
Center the 4-line window around the first match - // Try to show 1 line before and 2 lines after the match const startLine = Math.max(0, firstMatchLine - 1); const endLine = Math.min(lines.length, startLine + 4); snippet = lines.slice(startLine, endLine).join('\n'); } else { - // No match found in lines (shouldn't happen), just take first 4 snippet = lines.slice(0, 4).join('\n'); } - // Add ellipsis if we truncated lines snippet = snippet + "..."; - } else if (lines.length > 1) { - // For multi-line snippets that are 4 or fewer lines, keep them as-is - // No need to truncate - } else { - // Single line content - apply original word boundary logic - // Try to start/end at word boundaries + } else if (lines.length <= 1) { + // Single line content - apply word boundary logic if (snippetStart > 0) { const firstSpace = snippet.search(/\s/); if (firstSpace > 0 && firstSpace < 20) { @@ -567,7 +561,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength } snippet = "..." 
+ snippet; } - + if (snippetStart + maxLength < content.length) { const lastSpace = snippet.search(/\s[^\s]*$/); if (lastSpace > snippet.length - 20 && lastSpace > 0) { @@ -601,13 +595,14 @@ function extractAttributeSnippet(noteId: string, searchTokens: string[], maxLeng // Look for attributes that match the search tokens for (const attr of attributes) { - const attrName = attr.name?.toLowerCase() || ""; - const attrValue = attr.value?.toLowerCase() || ""; + // Use pre-normalized fields from BAttribute for diacritic-insensitive matching + const attrName = attr.normalizedName || normalize(attr.name || ""); + const attrValue = attr.normalizedValue || normalize(attr.value || ""); const attrType = attr.type || ""; - + // Check if any search token matches the attribute name or value const hasMatch = searchTokens.some(token => { - const normalizedToken = normalizeString(token.toLowerCase()); + const normalizedToken = normalize(token); return attrName.includes(normalizedToken) || attrValue.includes(normalizedToken); }); @@ -675,7 +670,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) { includeHiddenNotes: true, fuzzyAttributeSearch: true, ignoreInternalAttributes: true, - ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId() + ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ?
"root" : hoistedNoteService.getHoistedNoteId(), + autocomplete: true }); const allSearchResults = findResultsWithQuery(query, searchContext); @@ -752,37 +748,40 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens } for (const result of searchResults) { - // Reset token - const tokenRegex = new RegExp(escapeRegExp(token), "gi"); let match; // Highlight in note path title if (result.highlightedNotePathTitle) { const titleRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = titleRegex.exec(normalizeString(result.highlightedNotePathTitle))) !== null) { + // Derive the diacritic-free title before the loop; it is re-derived after each wrapText() call because the markers shift the text + let titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle); + while ((match = titleRegex.exec(titleNoDiacritics)) !== null) { result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index + // 2 characters are added, so we need to adjust the index and re-derive titleRegex.lastIndex += 2; + titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle); } } // Highlight in content snippet if (result.highlightedContentSnippet) { const contentRegex = new RegExp(escapeRegExp(token), "gi"); - while ((match = contentRegex.exec(normalizeString(result.highlightedContentSnippet))) !== null) { + let contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet); + while ((match = contentRegex.exec(contentNoDiacritics)) !== null) { result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index contentRegex.lastIndex += 2; + contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet); } } // Highlight in attribute snippet if (result.highlightedAttributeSnippet) { const attributeRegex = new 
RegExp(escapeRegExp(token), "gi"); - while ((match =
attributeRegex.exec(normalizeString(result.highlightedAttributeSnippet))) !== null) { + let attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet); + while ((match = attributeRegex.exec(attrNoDiacritics)) !== null) { result.highlightedAttributeSnippet = wrapText(result.highlightedAttributeSnippet, match.index, token.length, "{", "}"); - // 2 characters are added, so we need to adjust the index attributeRegex.lastIndex += 2; + attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet); } } } diff --git a/apps/server/src/services/search/services/search_benchmark.spec.ts b/apps/server/src/services/search/services/search_benchmark.spec.ts new file mode 100644 index 0000000000..c3ece17fb5 --- /dev/null +++ b/apps/server/src/services/search/services/search_benchmark.spec.ts @@ -0,0 +1,677 @@ +/** + * Comprehensive search benchmark suite. + * + * Covers many scenarios: + * - Single-token, multi-token, phrase-like queries + * - Fuzzy matching enabled vs disabled + * - Autocomplete vs full search + * - Diacritics / unicode queries + * - No-match queries + * - Varying note counts (1K, 5K, 10K, 20K) + * - Warm cache vs cold cache + * + * All times are in-memory (monkeypatched getContent, no real SQL). 
+ */ +import { describe, it, expect, afterEach } from "vitest"; +import searchService from "./search.js"; +import BNote from "../../../becca/entities/bnote.js"; +import BBranch from "../../../becca/entities/bbranch.js"; +import SearchContext from "../search_context.js"; +import becca from "../../../becca/becca.js"; +import { NoteBuilder, note } from "../../../test/becca_mocking.js"; + +// ── helpers ────────────────────────────────────────────────────────── + +function randomWord(len = 6): string { + const chars = "abcdefghijklmnopqrstuvwxyz"; + let word = ""; + for (let i = 0; i < len; i++) { + word += chars[Math.floor(Math.random() * chars.length)]; + } + return word; +} + +function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string { + const paragraphs: string[] = []; + let wordsRemaining = wordCount; + const kws = keywords ?? []; + + while (wordsRemaining > 0) { + const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40)); + const words: string[] = []; + for (let i = 0; i < paraWords; i++) { + words.push(randomWord(3 + Math.floor(Math.random() * 10))); + } + if (includeKeywords && paragraphs.length === 2) { + for (let k = 0; k < kws.length; k++) { + const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1))); + words[pos] = kws[k]; + } + } + paragraphs.push(`

${words.join(" ")}

`); + wordsRemaining -= paraWords; + } + + return `${paragraphs.join("\n")}`; +} + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +function avg(nums: number[]): number { + return nums.reduce((a, b) => a + b, 0) / nums.length; +} + +function min(nums: number[]): number { + return Math.min(...nums); +} + +// ── dataset builder ────────────────────────────────────────────────── + +const syntheticContent: Record = {}; + +function buildDataset(noteCount: number, opts: { + matchFraction?: number; + labelsPerNote?: number; + depth?: number; + contentWordCount?: number; + varyContentSize?: boolean; + titleKeywords?: string[]; + contentKeywords?: string[]; + /** Include notes with diacritics in titles */ + includeDiacritics?: boolean; +} = {}) { + const { + matchFraction = 0.1, + labelsPerNote = 3, + depth = 4, + contentWordCount = 300, + varyContentSize = true, + titleKeywords = ["target"], + contentKeywords = titleKeywords, + includeDiacritics = false, + } = opts; + + becca.reset(); + for (const key of Object.keys(syntheticContent)) { + delete syntheticContent[key]; + } + + const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" })); + new BBranch({ + branchId: "none_root", + noteId: "root", + parentNoteId: "none", + notePosition: 10 + }); + + const containers: NoteBuilder[] = []; + let parent = rootNote; + for (let d = 0; d < depth; d++) { + const container = note(`Container_${d}_${randomWord(4)}`); + parent.child(container); + containers.push(container); + parent = container; + } + + const matchCount = Math.floor(noteCount * matchFraction); + const diacriticTitles = [ + "résumé", "naïve", "café", "über", "ñoño", "exposé", + "Ångström", "Üntersuchung", "São Paulo", "François" + ]; + + for (let i = 0; i < noteCount; i++) { + const isMatch = i < matchCount; + let title: string; + + if (includeDiacritics && i % 20 === 0) { + // Every 
20th note gets a diacritics-heavy title + const dTitle = diacriticTitles[i % diacriticTitles.length]; + title = isMatch + ? `${dTitle} ${titleKeywords.join(" ")} Document ${i}` + : `${dTitle} ${randomWord(5)} Note ${i}`; + } else { + title = isMatch + ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}` + : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`; + } + + const n = note(title); + + for (let l = 0; l < labelsPerNote; l++) { + const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`; + const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8); + n.label(labelName, labelValue); + } + + let noteWordCount = contentWordCount; + if (varyContentSize) { + const r = Math.random(); + if (r < 0.2) noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3)); + else if (r < 0.7) noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6)); + else if (r < 0.9) noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7)); + else noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0)); + } + + const includeContentKeyword = isMatch && contentKeywords.length > 0; + syntheticContent[n.note.noteId] = generateHtmlContent( + noteWordCount, + includeContentKeyword, + includeContentKeyword ? 
contentKeywords : undefined + ); + + const containerIndex = i % containers.length; + containers[containerIndex].child(n); + } + + // Monkeypatch getContent() + for (const noteObj of Object.values(becca.notes)) { + const noteId = noteObj.noteId; + if (syntheticContent[noteId]) { + (noteObj as any).getContent = () => syntheticContent[noteId]; + } else { + (noteObj as any).getContent = () => ""; + } + } + + return { rootNote, matchCount }; +} + +// ── benchmark runner ───────────────────────────────────────────────── + +interface BenchmarkResult { + query: string; + mode: string; + noteCount: number; + avgMs: number; + minMs: number; + resultCount: number; +} + +function runBenchmark( + query: string, + mode: "autocomplete" | "fullSearch", + fuzzyEnabled: boolean, + iterations = 5 +): BenchmarkResult { + const noteCount = Object.keys(becca.notes).length; + + // Warm up + if (mode === "autocomplete") { + searchService.searchNotesForAutocomplete(query, true); + } else { + const ctx = new SearchContext({ fastSearch: false }); + ctx.enableFuzzyMatching = fuzzyEnabled; + searchService.findResultsWithQuery(query, ctx); + } + + const times: number[] = []; + let resultCount = 0; + + for (let i = 0; i < iterations; i++) { + if (mode === "autocomplete") { + // For autocomplete, fuzzy is controlled by the global option + // We'll manipulate enableFuzzyMatching after construction + const [results, ms] = timed(() => { + // searchNotesForAutocomplete creates its own SearchContext internally + // so we need to test via findResultsWithQuery for fuzzy control + const ctx = new SearchContext({ + fastSearch: true, + includeHiddenNotes: true, + fuzzyAttributeSearch: true, + ignoreInternalAttributes: true, + autocomplete: true + }); + ctx.enableFuzzyMatching = fuzzyEnabled; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } else { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: false }); 
+ ctx.enableFuzzyMatching = fuzzyEnabled; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } + } + + return { + query, + mode: `${mode}${fuzzyEnabled ? "+fuzzy" : ""}`, + noteCount, + avgMs: avg(times), + minMs: min(times), + resultCount + }; +} + +function printTable(title: string, results: BenchmarkResult[]) { + console.log(`\n${"═".repeat(110)}`); + console.log(` ${title}`); + console.log(`${"═".repeat(110)}`); + console.log( + " " + + "Query".padEnd(35) + + "Mode".padEnd(22) + + "Notes".padStart(7) + + "Avg (ms)".padStart(12) + + "Min (ms)".padStart(12) + + "Results".padStart(10) + ); + console.log(` ${"─".repeat(98)}`); + for (const r of results) { + console.log( + " " + + `"${r.query}"`.padEnd(35) + + r.mode.padEnd(22) + + String(r.noteCount).padStart(7) + + r.avgMs.toFixed(1).padStart(12) + + r.minMs.toFixed(1).padStart(12) + + String(r.resultCount).padStart(10) + ); + } + console.log(`${"═".repeat(110)}\n`); +} + +// ── tests ──────────────────────────────────────────────────────────── + +// Skipped by default - this is a benchmark, not a test. +// Remove .skip to run manually for performance analysis. 
+describe.skip("Comprehensive Search Benchmark", () => { + + afterEach(() => { + becca.reset(); + }); + + describe("Single-token queries", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + it(`single token @ ${noteCount} notes — fuzzy on vs off, autocomplete vs full`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting"], + contentKeywords: ["meeting"], + contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("meeting", "autocomplete", false), + runBenchmark("meeting", "autocomplete", true), + runBenchmark("meeting", "fullSearch", false), + runBenchmark("meeting", "fullSearch", true), + ]; + + printTable(`Single Token "meeting" — ${noteCount} notes`, results); + expect(results[0].resultCount).toBeGreaterThan(0); + }); + } + }); + + describe("Multi-token queries", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + it(`multi token @ ${noteCount} notes — fuzzy on vs off`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting", "notes", "january"], + contentKeywords: ["meeting", "notes", "january"], + contentWordCount: 400, + }); + + const results: BenchmarkResult[] = [ + // 2-token + runBenchmark("meeting notes", "autocomplete", false), + runBenchmark("meeting notes", "autocomplete", true), + runBenchmark("meeting notes", "fullSearch", false), + runBenchmark("meeting notes", "fullSearch", true), + // 3-token + runBenchmark("meeting notes january", "autocomplete", false), + runBenchmark("meeting notes january", "autocomplete", true), + runBenchmark("meeting notes january", "fullSearch", false), + runBenchmark("meeting notes january", "fullSearch", true), + ]; + + printTable(`Multi Token — ${noteCount} notes`, results); + expect(results[0].resultCount).toBeGreaterThan(0); + }); + } + }); + + describe("No-match queries (worst case — full scan, zero results)", () => { + for (const noteCount of [1000, 5000, 10000, 20000]) { + it(`no-match @ 
${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.1, + titleKeywords: ["target"], + contentKeywords: ["target"], + contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("xyznonexistent", "autocomplete", false), + runBenchmark("xyznonexistent", "autocomplete", true), + runBenchmark("xyznonexistent", "fullSearch", false), + runBenchmark("xyznonexistent", "fullSearch", true), + runBenchmark("xyzfoo xyzbar", "autocomplete", false), + runBenchmark("xyzfoo xyzbar", "autocomplete", true), + runBenchmark("xyzfoo xyzbar", "fullSearch", false), + runBenchmark("xyzfoo xyzbar", "fullSearch", true), + ]; + + printTable(`No-Match Queries — ${noteCount} notes`, results); + // All should return 0 results + for (const r of results) { + expect(r.resultCount).toBe(0); + } + }); + } + }); + + describe("Diacritics / Unicode queries", () => { + for (const noteCount of [1000, 5000, 10000]) { + it(`diacritics @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["résumé"], + contentKeywords: ["résumé"], + contentWordCount: 300, + includeDiacritics: true, + }); + + const results: BenchmarkResult[] = [ + // Exact diacritics + runBenchmark("résumé", "autocomplete", false), + runBenchmark("résumé", "autocomplete", true), + // ASCII equivalent (should still match via normalize) + runBenchmark("resume", "autocomplete", false), + runBenchmark("resume", "autocomplete", true), + // Full search + runBenchmark("résumé", "fullSearch", false), + runBenchmark("resume", "fullSearch", false), + ]; + + printTable(`Diacritics "résumé" / "resume" — ${noteCount} notes`, results); + }); + } + }); + + describe("Partial / prefix queries (simulating typing)", () => { + for (const noteCount of [5000, 10000, 20000]) { + it(`typing progression @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["documentation"], + contentKeywords: ["documentation"], + 
contentWordCount: 300, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("d", "autocomplete", false), + runBenchmark("do", "autocomplete", false), + runBenchmark("doc", "autocomplete", false), + runBenchmark("docu", "autocomplete", false), + runBenchmark("docum", "autocomplete", false), + runBenchmark("document", "autocomplete", false), + runBenchmark("documentation", "autocomplete", false), + // Same with fuzzy + runBenchmark("d", "autocomplete", true), + runBenchmark("doc", "autocomplete", true), + runBenchmark("document", "autocomplete", true), + runBenchmark("documentation", "autocomplete", true), + ]; + + printTable(`Typing Progression "documentation" — ${noteCount} notes`, results); + }); + } + }); + + describe("Attribute-matching queries", () => { + for (const noteCount of [5000, 10000]) { + it(`attribute search @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.15, + labelsPerNote: 5, + titleKeywords: ["important"], + contentKeywords: ["important"], + contentWordCount: 200, + }); + + const results: BenchmarkResult[] = [ + // "category" is a label name on matching notes + runBenchmark("category", "autocomplete", false), + runBenchmark("category", "autocomplete", true), + runBenchmark("category", "fullSearch", false), + runBenchmark("category", "fullSearch", true), + // "important" appears in both title and label value + runBenchmark("important", "autocomplete", false), + runBenchmark("important", "autocomplete", true), + ]; + + printTable(`Attribute Matching — ${noteCount} notes`, results); + }); + } + }); + + describe("Long queries (4-5 tokens)", () => { + for (const noteCount of [5000, 10000]) { + it(`long query @ ${noteCount} notes`, () => { + buildDataset(noteCount, { + matchFraction: 0.10, + titleKeywords: ["quarterly", "budget", "review", "report"], + contentKeywords: ["quarterly", "budget", "review", "report"], + contentWordCount: 500, + }); + + const results: BenchmarkResult[] = [ + runBenchmark("quarterly", 
"autocomplete", false), + runBenchmark("quarterly budget", "autocomplete", false), + runBenchmark("quarterly budget review", "autocomplete", false), + runBenchmark("quarterly budget review report", "autocomplete", false), + // Same with fuzzy + runBenchmark("quarterly budget review report", "autocomplete", true), + // Full search + runBenchmark("quarterly budget review report", "fullSearch", false), + runBenchmark("quarterly budget review report", "fullSearch", true), + ]; + + printTable(`Long Queries (4 tokens) — ${noteCount} notes`, results); + }); + } + }); + + describe("Mixed scenario — realistic user session", () => { + it("simulates a user session with varied queries @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.15, + titleKeywords: ["project", "planning"], + contentKeywords: ["project", "planning", "timeline", "budget"], + contentWordCount: 400, + varyContentSize: true, + includeDiacritics: true, + depth: 6, + }); + + const results: BenchmarkResult[] = [ + // Quick autocomplete lookups (user typing in search bar) + runBenchmark("pro", "autocomplete", false), + runBenchmark("project", "autocomplete", false), + runBenchmark("project plan", "autocomplete", false), + + // Full search (user hits Enter) + runBenchmark("project", "fullSearch", false), + runBenchmark("project planning", "fullSearch", false), + runBenchmark("project planning", "fullSearch", true), + + // Typo / near-miss with fuzzy + runBenchmark("projct", "autocomplete", false), + runBenchmark("projct", "autocomplete", true), + runBenchmark("projct planing", "fullSearch", false), + runBenchmark("projct planing", "fullSearch", true), + + // No results + runBenchmark("xyznonexistent", "autocomplete", false), + runBenchmark("xyznonexistent foo", "fullSearch", true), + + // Short common substring + runBenchmark("note", "autocomplete", false), + runBenchmark("document", "autocomplete", false), + ]; + + printTable("Realistic User Session — 10K notes", results); + }); + }); + + 
describe("Cache warmth impact", () => { + it("cold vs warm flat text index @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.15, + titleKeywords: ["target"], + contentKeywords: ["target"], + contentWordCount: 300, + }); + + console.log(`\n${"═".repeat(80)}`); + console.log(" Cold vs Warm Cache — 10K notes"); + console.log(`${"═".repeat(80)}`); + + // Cold: first search after dataset build (flat text index not yet built) + becca.flatTextIndex = null; + becca.dirtyFlatTextNoteIds.clear(); + const [coldResults, coldMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Cold (index build + search): ${coldMs.toFixed(1)}ms (${coldResults.length} results)`); + + // Warm: subsequent searches reuse the index + const warmTimes: number[] = []; + for (let i = 0; i < 5; i++) { + const [, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + warmTimes.push(ms); + } + console.log(` Warm (reuse index, 5 runs): avg ${avg(warmTimes).toFixed(1)}ms min ${min(warmTimes).toFixed(1)}ms`); + + // Incremental: dirty a few notes and search again + const noteIds = Object.keys(becca.notes).slice(0, 50); + for (const nid of noteIds) { + becca.dirtyNoteFlatText(nid); + } + const [, incrMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Incremental (50 dirty notes): ${incrMs.toFixed(1)}ms`); + + // Full rebuild + becca.flatTextIndex = null; + const [, rebuildMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true, autocomplete: true }); + ctx.enableFuzzyMatching = false; + return 
searchService.findResultsWithQuery("target", ctx); + }); + console.log(` Full rebuild (index = null): ${rebuildMs.toFixed(1)}ms`); + + console.log(`${"═".repeat(80)}\n`); + }); + }); + + describe("Fuzzy matching effectiveness comparison", () => { + it("exact vs fuzzy result quality @ 10K notes", () => { + buildDataset(10000, { + matchFraction: 0.10, + titleKeywords: ["performance"], + contentKeywords: ["performance", "optimization"], + contentWordCount: 300, + }); + + console.log(`\n${"═".repeat(90)}`); + console.log(" Fuzzy Matching Effectiveness — 10K notes"); + console.log(`${"═".repeat(90)}`); + console.log( + " " + + "Query".padEnd(30) + + "Fuzzy".padEnd(8) + + "Time (ms)".padStart(12) + + "Results".padStart(10) + + " Notes" + ); + console.log(` ${"─".repeat(70)}`); + + const queries = [ + "performance", // exact match + "performanc", // truncated + "preformance", // typo + "performence", // common misspelling + "optimization", // exact match + "optimzation", // typo + "perf optim", // abbreviated multi + ]; + + for (const query of queries) { + for (const fuzzy of [false, true]) { + const times: number[] = []; + let resultCount = 0; + for (let i = 0; i < 3; i++) { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + ctx.enableFuzzyMatching = fuzzy; + return searchService.findResultsWithQuery(query, ctx); + }); + times.push(ms); + resultCount = results.length; + } + console.log( + " " + + `"${query}"`.padEnd(30) + + (fuzzy ? 
"ON" : "OFF").padEnd(8) + + avg(times).toFixed(1).padStart(12) + + String(resultCount).padStart(10) + ); + } + } + + console.log(`${"═".repeat(90)}\n`); + }); + }); + + describe("Scale comparison summary", () => { + it("summary table across all note counts", () => { + const summaryResults: BenchmarkResult[] = []; + + for (const noteCount of [1000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.15, + titleKeywords: ["meeting", "notes"], + contentKeywords: ["meeting", "notes"], + contentWordCount: 400, + varyContentSize: true, + depth: 5, + }); + + // Core scenarios + summaryResults.push(runBenchmark("meeting", "autocomplete", false)); + summaryResults.push(runBenchmark("meeting", "autocomplete", true)); + summaryResults.push(runBenchmark("meeting notes", "autocomplete", false)); + summaryResults.push(runBenchmark("meeting notes", "autocomplete", true)); + summaryResults.push(runBenchmark("meeting", "fullSearch", false)); + summaryResults.push(runBenchmark("meeting", "fullSearch", true)); + summaryResults.push(runBenchmark("meeting notes", "fullSearch", false)); + summaryResults.push(runBenchmark("meeting notes", "fullSearch", true)); + summaryResults.push(runBenchmark("xyznonexistent", "autocomplete", false)); + summaryResults.push(runBenchmark("xyznonexistent", "fullSearch", true)); + } + + printTable("Scale Comparison Summary", summaryResults); + }); + }); +}); diff --git a/apps/server/src/services/search/services/search_profiling.spec.ts b/apps/server/src/services/search/services/search_profiling.spec.ts new file mode 100644 index 0000000000..6ed5d9fbb7 --- /dev/null +++ b/apps/server/src/services/search/services/search_profiling.spec.ts @@ -0,0 +1,665 @@ +/** + * Search performance profiling tests. + * + * These tests measure where time is spent in the search pipeline. + * We monkeypatch note.getContent() to return synthetic HTML content + * since unit tests don't have a real SQLite database. 
+ * + * KNOWN GAPS vs production: + * - note.getContent() is instant (monkeypatched) vs ~2ms SQL fetch + * - NoteContentFulltextExp.execute() is skipped (no sql.iterateRows) + * because fastSearch=true uses only NoteFlatTextExp + * - These tests focus on the in-memory/CPU-bound parts of the pipeline + */ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import searchService from "./search.js"; +import BNote from "../../../becca/entities/bnote.js"; +import BBranch from "../../../becca/entities/bbranch.js"; +import SearchContext from "../search_context.js"; +import becca from "../../../becca/becca.js"; +import beccaService from "../../../becca/becca_service.js"; +import { NoteBuilder, note, id } from "../../../test/becca_mocking.js"; +import SearchResult from "../search_result.js"; +import { normalizeSearchText } from "../utils/text_utils.js"; + +// ── helpers ────────────────────────────────────────────────────────── + +function randomWord(len = 6): string { + const chars = "abcdefghijklmnopqrstuvwxyz"; + let word = ""; + for (let i = 0; i < len; i++) { + word += chars[Math.floor(Math.random() * chars.length)]; + } + return word; +} + +function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string { + const paragraphs: string[] = []; + let wordsRemaining = wordCount; + const kws = keywords ?? ["target"]; + + while (wordsRemaining > 0) { + const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40)); + const words: string[] = []; + for (let i = 0; i < paraWords; i++) { + words.push(randomWord(3 + Math.floor(Math.random() * 10))); + } + if (includeKeywords && paragraphs.length === 2) { + // Inject all keywords into the paragraph at spaced positions + for (let k = 0; k < kws.length; k++) { + const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1))); + words[pos] = kws[k]; + } + } + paragraphs.push(`

${words.join(" ")}

`); + wordsRemaining -= paraWords; + } + + return `${paragraphs.join("\n")}`; +} + +function timed(fn: () => T): [T, number] { + const start = performance.now(); + const result = fn(); + return [result, performance.now() - start]; +} + +interface TimingEntry { label: string; ms: number; } + +function reportTimings(title: string, timings: TimingEntry[]) { + const total = timings.reduce((s, t) => s + t.ms, 0); + console.log(`\n=== ${title} (total: ${total.toFixed(1)}ms) ===`); + for (const { label, ms } of timings) { + const pct = total > 0 ? ((ms / total) * 100).toFixed(0) : "0"; + const bar = "#".repeat(Math.max(1, Math.round(ms / total * 40))); + console.log(` ${label.padEnd(55)} ${ms.toFixed(1).padStart(8)}ms ${pct.padStart(3)}% ${bar}`); + } +} + +// ── dataset builder ────────────────────────────────────────────────── + +const syntheticContent: Record = {}; + +function buildDataset(noteCount: number, opts: { + matchFraction?: number; + labelsPerNote?: number; + depth?: number; + contentWordCount?: number; + /** When set, contentWordCount is treated as a median and actual sizes vary from 0.2x to 3x */ + varyContentSize?: boolean; + /** Keywords to inject into matching notes' titles (default: ["target"]) */ + titleKeywords?: string[]; + /** Keywords to inject into matching notes' content (default: same as titleKeywords) */ + contentKeywords?: string[]; +} = {}) { + const { + matchFraction = 0.1, + labelsPerNote = 3, + depth = 3, + contentWordCount = 200, + varyContentSize = false, + titleKeywords = ["target"], + contentKeywords = titleKeywords, + } = opts; + + becca.reset(); + for (const key of Object.keys(syntheticContent)) { + delete syntheticContent[key]; + } + + const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" })); + new BBranch({ + branchId: "none_root", + noteId: "root", + parentNoteId: "none", + notePosition: 10 + }); + + const containers: NoteBuilder[] = []; + let parent = rootNote; + for (let d = 0; d < depth; d++) 
{ + const container = note(`Container_${d}_${randomWord(4)}`); + parent.child(container); + containers.push(container); + parent = container; + } + + const matchCount = Math.floor(noteCount * matchFraction); + + for (let i = 0; i < noteCount; i++) { + const isMatch = i < matchCount; + const title = isMatch + ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}` + : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`; + + const n = note(title); + + for (let l = 0; l < labelsPerNote; l++) { + const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`; + const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8); + n.label(labelName, labelValue); + } + + // Vary content size: 0.2x to 3x the median, producing a realistic + // mix of short stubs, medium notes, and long documents. + let noteWordCount = contentWordCount; + if (varyContentSize) { + const r = Math.random(); + if (r < 0.2) { + noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3)); // 20-50% (short stubs) + } else if (r < 0.7) { + noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6)); // 70-130% (medium) + } else if (r < 0.9) { + noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7)); // 130-200% (long) + } else { + noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0)); // 200-300% (very long) + } + } + + const includeContentKeyword = isMatch && contentKeywords.length > 0; + syntheticContent[n.note.noteId] = generateHtmlContent( + noteWordCount, + includeContentKeyword, + includeContentKeyword ? 
contentKeywords : undefined + ); + + const containerIndex = i % containers.length; + containers[containerIndex].child(n); + } + + // Monkeypatch getContent() + for (const noteObj of Object.values(becca.notes)) { + const noteId = noteObj.noteId; + if (syntheticContent[noteId]) { + (noteObj as any).getContent = () => syntheticContent[noteId]; + } else { + (noteObj as any).getContent = () => ""; + } + } + + return { rootNote, matchCount }; +} + +// ── profiling tests ────────────────────────────────────────────────── + +describe("Search Profiling", () => { + + afterEach(() => { + becca.reset(); + }); + + /** + * Break down the autocomplete pipeline into every individual stage, + * including previously unmeasured operations like getBestNotePath, + * SearchResult construction, and getNoteTitleForPath. + */ + describe("Granular autocomplete pipeline", () => { + + for (const noteCount of [500, 2000, 5000, 10000]) { + it(`granular breakdown with ${noteCount} notes`, () => { + const timings: TimingEntry[] = []; + + const [, buildMs] = timed(() => buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 300, + depth: 5 + })); + timings.push({ label: `Dataset build (${noteCount} notes)`, ms: buildMs }); + + // === NoteFlatTextExp: getCandidateNotes === + // This calls getFlatText() + normalizeSearchText() for EVERY note + const allNotes = Object.values(becca.notes); + for (const n of allNotes) n.invalidateThisCache(); + + const [, candidateMs] = timed(() => { + const token = normalizeSearchText("target"); + let count = 0; + for (const n of allNotes) { + const flatText = normalizeSearchText(n.getFlatText()); + if (flatText.includes(token)) count++; + } + return count; + }); + timings.push({ label: `getCandidateNotes simulation (cold caches)`, ms: candidateMs }); + + // Warm cache version + const [candidateCount, candidateWarmMs] = timed(() => { + const token = normalizeSearchText("target"); + let count = 0; + for (const n of allNotes) { + const flatText = 
normalizeSearchText(n.getFlatText()); + if (flatText.includes(token)) count++; + } + return count; + }); + timings.push({ label: `getCandidateNotes simulation (warm caches)`, ms: candidateWarmMs }); + + // === getBestNotePath for each candidate === + const candidates = allNotes.filter(n => { + const flatText = normalizeSearchText(n.getFlatText()); + return flatText.includes("target"); + }); + + const [, pathMs] = timed(() => { + for (const n of candidates) { + n.getBestNotePath(); + } + }); + timings.push({ label: `getBestNotePath (${candidates.length} notes)`, ms: pathMs }); + + // === SearchResult construction (includes getNoteTitleForPath) === + const paths = candidates.map(n => n.getBestNotePath()).filter(Boolean); + + const [searchResults, srMs] = timed(() => { + return paths.map(p => new SearchResult(p)); + }); + timings.push({ label: `SearchResult construction (${paths.length} results)`, ms: srMs }); + + // === computeScore === + const [, scoreMs] = timed(() => { + for (const r of searchResults) { + r.computeScore("target", ["target"], true); + } + }); + timings.push({ label: `computeScore with fuzzy (${searchResults.length} results)`, ms: scoreMs }); + + const [, scoreNoFuzzyMs] = timed(() => { + for (const r of searchResults) { + r.computeScore("target", ["target"], false); + } + }); + timings.push({ label: `computeScore no-fuzzy`, ms: scoreNoFuzzyMs }); + + // === Sorting === + const [, sortMs] = timed(() => { + searchResults.sort((a, b) => { + if (a.score !== b.score) return b.score - a.score; + if (a.notePathArray.length === b.notePathArray.length) { + return a.notePathTitle < b.notePathTitle ? 
-1 : 1; + } + return a.notePathArray.length - b.notePathArray.length; + }); + }); + timings.push({ label: `Sort results`, ms: sortMs }); + + // === Trim + content snippet extraction === + const trimmed = searchResults.slice(0, 200); + + const [, snippetMs] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + timings.push({ label: `Content snippet extraction (${trimmed.length} results)`, ms: snippetMs }); + + const [, attrMs] = timed(() => { + for (const r of trimmed) { + r.attributeSnippet = searchService.extractAttributeSnippet( + r.noteId, ["target"] + ); + } + }); + timings.push({ label: `Attribute snippet extraction`, ms: attrMs }); + + // === Highlighting === + const [, hlMs] = timed(() => { + searchService.highlightSearchResults(trimmed, ["target"]); + }); + timings.push({ label: `Highlighting`, ms: hlMs }); + + // === Final mapping (getNoteTitleAndIcon) === + const [, mapMs] = timed(() => { + for (const r of trimmed) { + beccaService.getNoteTitleAndIcon(r.noteId); + } + }); + timings.push({ label: `getNoteTitleAndIcon (${trimmed.length} results)`, ms: mapMs }); + + // === Full autocomplete for comparison === + const [autoResults, autoMs] = timed(() => { + return searchService.searchNotesForAutocomplete("target", true); + }); + timings.push({ label: `Full autocomplete call (end-to-end)`, ms: autoMs }); + + reportTimings(`Granular Autocomplete — ${noteCount} notes`, timings); + expect(autoResults.length).toBeGreaterThan(0); + }); + } + }); + + /** + * Test the specific cost of normalizeSearchText which is called + * pervasively throughout the pipeline. 
+ */ + describe("normalizeSearchText cost", () => { + + it("profile normalizeSearchText at scale", () => { + buildDataset(5000, { matchFraction: 0.2, contentWordCount: 100 }); + + // Generate various text lengths to profile + const shortTexts = Array.from({ length: 5000 }, () => randomWord(10)); + const mediumTexts = Array.from({ length: 5000 }, () => + Array.from({ length: 20 }, () => randomWord(6)).join(" ") + ); + const longTexts = Object.values(becca.notes).map(n => n.getFlatText()); + + console.log("\n=== normalizeSearchText cost ==="); + + const [, shortMs] = timed(() => { + for (const t of shortTexts) normalizeSearchText(t); + }); + console.log(` 5000 short texts (10 chars): ${shortMs.toFixed(1)}ms (${(shortMs/5000*1000).toFixed(1)}µs/call)`); + + const [, medMs] = timed(() => { + for (const t of mediumTexts) normalizeSearchText(t); + }); + console.log(` 5000 medium texts (120 chars): ${medMs.toFixed(1)}ms (${(medMs/5000*1000).toFixed(1)}µs/call)`); + + const [, longMs] = timed(() => { + for (const t of longTexts) normalizeSearchText(t); + }); + console.log(` ${longTexts.length} flat texts (varying): ${longMs.toFixed(1)}ms (${(longMs/longTexts.length*1000).toFixed(1)}µs/call)`); + }); + }); + + /** + * Test the searchPathTowardsRoot recursive walk which runs + * for every candidate note in NoteFlatTextExp. 
+ */ + describe("searchPathTowardsRoot cost", () => { + + it("profile recursive walk with varying hierarchy depth", () => { + console.log("\n=== Search path walk vs hierarchy depth ==="); + + for (const depth of [3, 5, 8, 12]) { + buildDataset(2000, { + matchFraction: 0.15, + depth, + contentWordCount: 50 + }); + + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + console.log(` depth=${depth}: ${ms.toFixed(1)}ms (${results.length} results)`); + } + }); + }); + + /** + * Content snippet extraction scaling — the operation that calls + * note.getContent() for each result. + */ + describe("Content snippet extraction", () => { + + it("profile snippet extraction with varying content sizes", () => { + console.log("\n=== Content snippet extraction vs content size ==="); + + for (const wordCount of [50, 200, 500, 1000, 2000, 5000]) { + buildDataset(500, { + matchFraction: 0.5, + contentWordCount: wordCount + }); + + const ctx = new SearchContext({ fastSearch: true }); + const results = searchService.findResultsWithQuery("target", ctx); + const trimmed = results.slice(0, 200); + + const [, ms] = timed(() => { + for (const r of trimmed) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + + const avgContentLen = Object.values(syntheticContent) + .slice(0, 100) + .reduce((s, c) => s + c.length, 0) / 100; + + console.log(` ${String(wordCount).padStart(5)} words/note (avg ${Math.round(avgContentLen)} chars) × ${trimmed.length} results: ${ms.toFixed(1)}ms (${(ms / trimmed.length).toFixed(3)}ms/note)`); + } + }); + + it("profile snippet extraction with varying result counts", () => { + console.log("\n=== Content snippet extraction vs result count ==="); + + buildDataset(2000, { + matchFraction: 0.5, + contentWordCount: 500 + }); + + const ctx = new SearchContext({ fastSearch: true }); + const allResults = 
searchService.findResultsWithQuery("target", ctx); + + for (const count of [5, 10, 20, 50, 100, 200]) { + const subset = allResults.slice(0, count); + + const [, ms] = timed(() => { + for (const r of subset) { + r.contentSnippet = searchService.extractContentSnippet( + r.noteId, ["target"] + ); + } + }); + + console.log(` ${String(count).padStart(3)} results: ${ms.toFixed(1)}ms (${(ms / count).toFixed(3)}ms/note)`); + } + }); + }); + + /** + * Two-phase exact/fuzzy search cost. + */ + describe("Two-phase search cost", () => { + + for (const noteCount of [1000, 5000, 10000]) { + it(`exact vs progressive with ${noteCount} notes`, () => { + const timings: TimingEntry[] = []; + + buildDataset(noteCount, { matchFraction: 0.005, contentWordCount: 50 }); + + const [exactR, exactMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + ctx.enableFuzzyMatching = false; + return searchService.findResultsWithQuery("target", ctx); + }); + timings.push({ label: `Exact-only (${exactR.length} results)`, ms: exactMs }); + + const [progR, progMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + timings.push({ label: `Progressive exact→fuzzy (${progR.length} results)`, ms: progMs }); + + const overhead = progMs - exactMs; + timings.push({ label: `Fuzzy phase overhead`, ms: Math.max(0, overhead) }); + + reportTimings(`Two-phase — ${noteCount} notes`, timings); + }); + } + }); + + /** + * End-to-end scaling to give the full picture. + */ + /** + * Multi-token search with varying content sizes. + * Real users search things like "meeting notes january" — this exercises + * the multi-token path (which doesn't use the single-token fast path) + * with a realistic mix of note sizes. 
+ */ + describe("Multi-token search with varying content sizes", () => { + + it("single vs multi-token autocomplete at scale", () => { + console.log("\n=== Single vs multi-token autocomplete (varying content sizes) ==="); + + for (const noteCount of [1000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.15, + contentWordCount: 400, + varyContentSize: true, + depth: 5, + titleKeywords: ["meeting", "notes", "january"], + contentKeywords: ["meeting", "notes", "january"], + }); + + // Warm up + searchService.searchNotesForAutocomplete("meeting", true); + + // Single token + const singleTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting", true)); + singleTimes.push(ms); + } + const singleAvg = singleTimes.reduce((a, b) => a + b, 0) / singleTimes.length; + + // Two tokens + const twoTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes", true)); + twoTimes.push(ms); + } + const twoAvg = twoTimes.reduce((a, b) => a + b, 0) / twoTimes.length; + + // Three tokens + const threeTimes: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes january", true)); + threeTimes.push(ms); + } + const threeAvg = threeTimes.reduce((a, b) => a + b, 0) / threeTimes.length; + + console.log( + ` ${String(noteCount).padStart(6)} notes: ` + + `1-token ${singleAvg.toFixed(1)}ms ` + + `2-token ${twoAvg.toFixed(1)}ms ` + + `3-token ${threeAvg.toFixed(1)}ms` + ); + } + }); + + it("multi-token with realistic content size distribution", () => { + console.log("\n=== Multi-token search — content size distribution ==="); + + buildDataset(5000, { + matchFraction: 0.15, + contentWordCount: 400, + varyContentSize: true, + depth: 5, + titleKeywords: ["project", "review"], + contentKeywords: ["project", "review"], + }); + + // Report the actual 
content size distribution + const sizes = Object.values(syntheticContent).map(c => c.length); + sizes.sort((a, b) => a - b); + const p10 = sizes[Math.floor(sizes.length * 0.1)]; + const p50 = sizes[Math.floor(sizes.length * 0.5)]; + const p90 = sizes[Math.floor(sizes.length * 0.9)]; + const p99 = sizes[Math.floor(sizes.length * 0.99)]; + console.log(` Content sizes: p10=${p10} p50=${p50} p90=${p90} p99=${p99} chars`); + + // Warm up + searchService.searchNotesForAutocomplete("project", true); + + const queries = [ + "project", + "project review", + "project review document", + `${randomWord(7)}`, // no-match single token + `${randomWord(5)} ${randomWord(6)}`, // no-match multi token + ]; + + for (const query of queries) { + const times: number[] = []; + let resultCount = 0; + for (let i = 0; i < 3; i++) { + const [r, ms] = timed(() => searchService.searchNotesForAutocomplete(query, true)); + times.push(ms); + resultCount = r.length; + } + const avg = times.reduce((a, b) => a + b, 0) / times.length; + const label = `"${query}"`.padEnd(35); + console.log(` ${label} ${avg.toFixed(1)}ms (${resultCount} results)`); + } + }); + }); + + describe("End-to-end scaling", () => { + + it("autocomplete at different scales", () => { + console.log("\n=== End-to-end autocomplete scaling ==="); + console.log(" (fastSearch=true, monkeypatched getContent, no real SQL)"); + + for (const noteCount of [100, 500, 1000, 2000, 5000, 10000, 20000]) { + buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 300, + depth: 4 + }); + + // Warm up + searchService.searchNotesForAutocomplete("target", true); + + const times: number[] = []; + for (let i = 0; i < 3; i++) { + const [, ms] = timed(() => searchService.searchNotesForAutocomplete("target", true)); + times.push(ms); + } + + const avg = times.reduce((a, b) => a + b, 0) / times.length; + const min = Math.min(...times); + + console.log( + ` ${String(noteCount).padStart(6)} notes: avg ${avg.toFixed(1)}ms ` + + `min 
${min.toFixed(1)}ms` + ); + } + }); + + it("compare fast vs non-fast search", () => { + console.log("\n=== Fast vs non-fast search (no real SQL for content) ==="); + + for (const noteCount of [500, 2000, 5000]) { + buildDataset(noteCount, { + matchFraction: 0.2, + contentWordCount: 200, + depth: 4 + }); + + const [, fastMs] = timed(() => { + const ctx = new SearchContext({ fastSearch: true }); + return searchService.findResultsWithQuery("target", ctx); + }); + + // Non-fast search tries NoteContentFulltextExp which uses sql.iterateRows + // This will likely fail/return empty since there's no real DB, but we + // can still measure the overhead of attempting it + let nonFastMs: number; + let nonFastCount: number; + try { + const [results, ms] = timed(() => { + const ctx = new SearchContext({ fastSearch: false }); + return searchService.findResultsWithQuery("target", ctx); + }); + nonFastMs = ms; + nonFastCount = results.length; + } catch { + nonFastMs = -1; + nonFastCount = -1; + } + + console.log( + ` ${String(noteCount).padStart(5)} notes: fast=${fastMs.toFixed(1)}ms ` + + `non-fast=${nonFastMs >= 0 ? nonFastMs.toFixed(1) + 'ms' : 'FAILED (no real DB)'} ` + + `(non-fast results: ${nonFastCount})` + ); + } + }); + }); +}); diff --git a/apps/server/src/services/search/services/types.ts b/apps/server/src/services/search/services/types.ts index 7edc3b4ae5..60d00540c6 100644 --- a/apps/server/src/services/search/services/types.ts +++ b/apps/server/src/services/search/services/types.ts @@ -21,4 +21,6 @@ export interface SearchParams { limit?: number | null; debug?: boolean; fuzzyAttributeSearch?: boolean; + /** When true, skip the two-phase fuzzy fallback and use the single-token fast path. 
*/ + autocomplete?: boolean; } diff --git a/apps/server/src/services/search/utils/text_utils.spec.ts b/apps/server/src/services/search/utils/text_utils.spec.ts index a5f1da129d..146f5cc0fe 100644 --- a/apps/server/src/services/search/utils/text_utils.spec.ts +++ b/apps/server/src/services/search/utils/text_utils.spec.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from "vitest"; -import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js'; +import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js'; describe('Fuzzy Search Core', () => { describe('calculateOptimizedEditDistance', () => { @@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => { expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens }); }); + + describe('stripHtmlTags', () => { + it('strips simple HTML tags', () => { + expect(stripHtmlTags('

Hello

')).toBe('Hello'); + expect(stripHtmlTags('
World
')).toBe('World'); + expect(stripHtmlTags('Bold and italic')).toBe('Bold and italic'); + }); + + it('handles self-closing tags', () => { + expect(stripHtmlTags('Line1
Line2')).toBe('Line1Line2'); + expect(stripHtmlTags('Image: ')).toBe('Image: '); + }); + + it('handles tags with attributes', () => { + expect(stripHtmlTags('Link')).toBe('Link'); + expect(stripHtmlTags('
Content
')).toBe('Content'); + }); + + it('handles nested tag patterns securely', () => { + // Security property: no complete patterns remain after stripping + // Residual `>` chars are harmless for XSS + + // Nested tags: inner tag removed, then outer tag removed + // c> → → '' (but leaves residual `c>`) + const result1 = stripHtmlTags('c>text'); + expect(result1).not.toMatch(/<[a-z]/i); // No opening tags remain + expect(result1).toBe('c>text'); // Residual text is safe + + // Complex nesting leaves no exploitable patterns + const result2 = stripHtmlTags('ipt>alert(1)'); + expect(result2).not.toMatch(/