feat(search): try to improve search performance through some creative mechanisms... (#9034)
@@ -162,8 +162,9 @@ Trilium provides powerful user scripting capabilities:

- To add a new user preference:

1. Add the option type to `OptionDefinitions` in `packages/commons/src/lib/options_interface.ts`
2. Add a default value in `apps/server/src/services/options_init.ts` in the `defaultOptions` array
3. **Whitelist the option** in `apps/server/src/routes/api/options.ts` by adding it to the `ALLOWED_OPTIONS` array — **without this, the API will reject changes with "Option 'X' is not allowed to be changed"**
4. If the option should be user-editable in the UI, add a control in the appropriate settings component (e.g., `apps/client/src/widgets/type_widgets/options/other.tsx`) and a translation key in `apps/client/src/translations/en/translation.json`
5. Use the `useTriliumOption("optionName")` hook in React components to read/write the option (see the sketch after this list)

- Available hooks: `useTriliumOption` (string), `useTriliumOptionBool`, `useTriliumOptionInt`, `useTriliumOptionJson`
- See `docs/Developer Guide/Developer Guide/Concepts/Options/Creating a new option.md` for detailed documentation
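For concreteness, a minimal sketch of steps 3–5 wired together; the option name `myNewOption` and this component are hypothetical (invented for illustration), while the `useTriliumOptionBool`/`FormToggle` usage mirrors the components in this PR:

```tsx
import FormToggle from "../../react/FormToggle";
import { useTriliumOptionBool } from "../../react/hooks";

// Hypothetical control for a boolean option named "myNewOption".
// The hook reads the current value and persists changes through the
// options API — which is why the ALLOWED_OPTIONS whitelist entry is mandatory.
export default function MyNewOptionToggle() {
    const [ enabled, setEnabled ] = useTriliumOptionBool("myNewOption");

    return (
        <FormToggle
            switchOnName="" switchOffName=""
            currentValue={enabled}
            onChange={setEnabled}
        />
    );
}
```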
@@ -1324,6 +1324,13 @@
        "erase_excess_revision_snapshots": "Erase excess revision snapshots now",
        "erase_excess_revision_snapshots_prompt": "Excess revision snapshots have been erased."
    },
    "search": {
        "title": "Search",
        "fuzzy_matching_label": "Typo tolerance in search",
        "fuzzy_matching_description": "Affects quick search and full search. Finds similar words when exact matches are insufficient.",
        "autocomplete_fuzzy_label": "Typo tolerance in autocomplete",
        "autocomplete_fuzzy_description": "Affects jump-to-note and note selectors. Slower but tolerates typos."
    },
    "search_engine": {
        "title": "Search Engine",
        "custom_search_engine_info": "Custom search engine requires both a name and a URL to be set. If either of these is not set, DuckDuckGo will be used as the default search engine.",
@@ -14,13 +14,16 @@ import FormGroup from "../../react/FormGroup";
import FormSelect from "../../react/FormSelect";
import FormText from "../../react/FormText";
import FormTextBox, { FormTextBoxWithUnit } from "../../react/FormTextBox";
import FormToggle from "../../react/FormToggle";
import { useTriliumOption, useTriliumOptionBool, useTriliumOptionJson } from "../../react/hooks";
import OptionsRow from "./components/OptionsRow";
import OptionsSection from "./components/OptionsSection";
import TimeSelector from "./components/TimeSelector";

export default function OtherSettings() {
    return (
        <>
            <SearchSettings />
            {isElectron() && <>
                <SearchEngineSettings />
                <TrayOptionsSettings />
@@ -36,6 +39,39 @@ export default function OtherSettings() {
    );
}

function SearchSettings() {
    const [ fuzzyEnabled, setFuzzyEnabled ] = useTriliumOptionBool("searchEnableFuzzyMatching");
    const [ autocompleteFuzzy, setAutocompleteFuzzy ] = useTriliumOptionBool("searchAutocompleteFuzzy");

    return (
        <OptionsSection title={t("search.title")}>
            <OptionsRow
                name="search-fuzzy-matching"
                label={t("search.fuzzy_matching_label")}
                description={t("search.fuzzy_matching_description")}
            >
                <FormToggle
                    switchOnName="" switchOffName=""
                    currentValue={fuzzyEnabled}
                    onChange={setFuzzyEnabled}
                />
            </OptionsRow>

            <OptionsRow
                name="search-autocomplete-fuzzy"
                label={t("search.autocomplete_fuzzy_label")}
                description={t("search.autocomplete_fuzzy_description")}
            >
                <FormToggle
                    switchOnName="" switchOffName=""
                    currentValue={autocompleteFuzzy}
                    onChange={setAutocompleteFuzzy}
                />
            </OptionsRow>
        </OptionsSection>
    );
}

function SearchEngineSettings() {
    const [ customSearchEngineName, setCustomSearchEngineName ] = useTriliumOption("customSearchEngineName");
    const [ customSearchEngineUrl, setCustomSearchEngineUrl ] = useTriliumOption("customSearchEngineUrl");
Binary file not shown.
@@ -1,4 +1,6 @@
import sql from "../services/sql.js";
import log from "../services/log.js";
import { formatSize } from "../services/utils.js";
import NoteSet from "../services/search/note_set.js";
import NotFoundError from "../errors/not_found_error.js";
import type BOption from "./entities/boption.js";
@@ -31,9 +33,22 @@ export default class Becca {

    allNoteSetCache: NoteSet | null;

    /**
     * Pre-built parallel arrays for fast flat text scanning in search.
     * Avoids per-note property access overhead when iterating 50K+ notes.
     * Supports incremental updates: when individual notes change, only their
     * entries are rebuilt rather than the entire index.
     */
    flatTextIndex: { notes: BNote[], flatTexts: string[], noteIdToIdx: Map<string, number> } | null;

    /** NoteIds whose flat text needs to be recomputed in the index. */
    dirtyFlatTextNoteIds: Set<string>;

    constructor() {
        this.dirtyFlatTextNoteIds = new Set();
        this.allNoteSetCache = null;
        this.flatTextIndex = null;
        this.reset();
    }

    reset() {
@@ -242,6 +257,67 @@
    /** Should be called when the set of all non-skeleton notes changes (added/removed) */
    dirtyNoteSetCache() {
        this.allNoteSetCache = null;
        // Full rebuild needed since the note set itself changed
        this.flatTextIndex = null;
        this.dirtyFlatTextNoteIds.clear();
    }

    /** Mark a single note's flat text as needing recomputation in the index. */
    dirtyNoteFlatText(noteId: string) {
        if (this.flatTextIndex) {
            // Index exists — schedule an incremental update
            this.dirtyFlatTextNoteIds.add(noteId);
        }
        // If flatTextIndex is null, full rebuild will happen on next access anyway
    }

    /**
     * Returns pre-built parallel arrays of notes and their flat texts for fast scanning.
     * The flat texts are already normalized (lowercase, diacritics removed).
     * Supports incremental updates: when individual notes are dirtied, only their
     * entries are recomputed rather than rebuilding the entire index.
     */
    getFlatTextIndex(): { notes: BNote[], flatTexts: string[], noteIdToIdx: Map<string, number> } {
        if (!this.flatTextIndex) {
            // Measure heap before building
            const heapBefore = process.memoryUsage().heapUsed;

            const allNoteSet = this.getAllNoteSet();
            const notes: BNote[] = [];
            const flatTexts: string[] = [];
            const noteIdToIdx = new Map<string, number>();

            for (const note of allNoteSet.notes) {
                noteIdToIdx.set(note.noteId, notes.length);
                notes.push(note);
                flatTexts.push(note.getFlatText());
            }

            this.flatTextIndex = { notes, flatTexts, noteIdToIdx };
            this.dirtyFlatTextNoteIds.clear();

            // Measure heap after building and log
            const heapAfter = process.memoryUsage().heapUsed;
            const heapDelta = heapAfter - heapBefore;
            log.info(`Flat text search index built: ${notes.length} notes, ${formatSize(heapDelta)}`);
        } else if (this.dirtyFlatTextNoteIds.size > 0) {
            // Incremental update: only recompute flat texts for dirtied notes
            const { flatTexts, noteIdToIdx } = this.flatTextIndex;

            for (const noteId of this.dirtyFlatTextNoteIds) {
                const idx = noteIdToIdx.get(noteId);
                if (idx !== undefined) {
                    const note = this.notes[noteId];
                    if (note) {
                        flatTexts[idx] = note.getFlatText();
                    }
                }
            }

            this.dirtyFlatTextNoteIds.clear();
        }

        return this.flatTextIndex;
    }

    getAllNoteSet() {
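Because the dirty-tracking contract spans three methods, a distilled standalone sketch of the same pattern may help review; the `FlatIndex` class and `Note` shape below are invented for illustration and are not part of this diff:

```ts
type Note = { noteId: string; getFlatText(): string };

// Minimal stand-in for Becca's incremental flat-text index.
class FlatIndex {
    private index: { notes: Note[]; flatTexts: string[]; noteIdToIdx: Map<string, number> } | null = null;
    private dirty = new Set<string>();

    constructor(private allNotes: Map<string, Note>) {}

    markDirty(noteId: string) {
        // Only track dirt while an index exists; otherwise the next get() rebuilds anyway.
        if (this.index) this.dirty.add(noteId);
    }

    invalidate() {
        // A note was added/removed: the parallel arrays are stale, force a full rebuild.
        this.index = null;
        this.dirty.clear();
    }

    get() {
        if (!this.index) {
            // Full rebuild: one pass over all notes.
            const notes: Note[] = [], flatTexts: string[] = [], noteIdToIdx = new Map<string, number>();
            for (const note of this.allNotes.values()) {
                noteIdToIdx.set(note.noteId, notes.length);
                notes.push(note);
                flatTexts.push(note.getFlatText());
            }
            this.index = { notes, flatTexts, noteIdToIdx };
        } else {
            // Incremental: patch only the dirtied entries in place.
            for (const noteId of this.dirty) {
                const idx = this.index.noteIdToIdx.get(noteId);
                const note = this.allNotes.get(noteId);
                if (idx !== undefined && note) this.index.flatTexts[idx] = note.getFlatText();
            }
        }
        this.dirty.clear();
        return this.index;
    }
}
```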
@@ -6,6 +6,7 @@ import dateUtils from "../../services/date_utils.js";
import promotedAttributeDefinitionParser from "../../services/promoted_attribute_definition_parser.js";
import sanitizeAttributeName from "../../services/sanitize_attribute_name.js";
import type { AttributeRow, AttributeType } from "@triliumnext/commons";
import { normalize } from "../../services/utils.js";

interface SavingOpts {
    skipValidation?: boolean;
@@ -34,6 +35,11 @@ class BAttribute extends AbstractBeccaEntity<BAttribute> {
    value!: string;
    isInheritable!: boolean;

    /** Pre-normalized (lowercase, diacritics removed) name for search. */
    normalizedName!: string;
    /** Pre-normalized (lowercase, diacritics removed) value for search. */
    normalizedValue!: string;

    constructor(row?: AttributeRow) {
        super();

@@ -59,6 +65,10 @@ class BAttribute extends AbstractBeccaEntity<BAttribute> {
        this.isInheritable = !!isInheritable;
        this.utcDateModified = utcDateModified;

        // Pre-compute normalized forms for search (avoids repeated normalize() calls in hot loops)
        this.normalizedName = normalize(this.name);
        this.normalizedValue = normalize(this.value);

        return this;
    }

@@ -192,6 +202,11 @@ class BAttribute extends AbstractBeccaEntity<BAttribute> {

        this.utcDateModified = dateUtils.utcNowDateTime();

        // Recompute normalized fields in case name/value were modified directly
        // (e.g., attr.value = "..." followed by attr.save())
        this.normalizedName = normalize(this.name);
        this.normalizedValue = normalize(this.value);

        super.beforeSaving();

        this.becca.attributes[this.attributeId] = this;

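The payoff of the two `normalized*` fields is that the search hot loop becomes plain string `includes()` calls. A self-contained sketch; the inline `normalize` below is a stand-in for the real util in `services/utils.js`, assuming only what the comments in this diff state (lowercase + diacritic removal):

```ts
// Stand-in for services/utils.js normalize(): lowercase + diacritic removal.
const normalize = (s: string): string =>
    s.toLowerCase().normalize("NFKD").replace(/\p{Diacritic}/gu, "");

// Normalized once, at entity load/save time...
const attr = { name: "Café", value: "Résumé 2024" };
const normalizedName = normalize(attr.name);   // "cafe"
const normalizedValue = normalize(attr.value); // "resume 2024"

// ...so the per-token matching loop does no normalization work at all.
const token = "cafe";
console.log(normalizedName.includes(token) || normalizedValue.includes(token)); // true
```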
@@ -802,6 +802,9 @@ class BNote extends AbstractBeccaEntity<BNote> {
        this.__attributeCache = null;
        this.__inheritableAttributeCache = null;
        this.__ancestorCache = null;

        // Mark only this note's flat text as dirty for incremental index update
        this.becca.dirtyNoteFlatText(this.noteId);
    }

    invalidateSubTree(path: string[] = []) {

@@ -99,6 +99,8 @@ const ALLOWED_OPTIONS = new Set<OptionNames>([
    "layoutOrientation",
    "backgroundEffects",
    "allowedHtmlTags",
    "searchEnableFuzzyMatching",
    "searchAutocompleteFuzzy",
    "redirectBareDomain",
    "showLoginInShareTheme",
    "splitEditorOrientation",

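A simplified sketch of the guard this whitelist feeds; the handler shape, `ValidationError`, and `optionService.setOption` are assumptions for illustration (the real handler lives in `apps/server/src/routes/api/options.ts`), while the error message is the one quoted in the docs change above:

```ts
// Simplified sketch — not the actual route handler.
function updateOption(name: OptionNames, value: string) {
    if (!ALLOWED_OPTIONS.has(name)) {
        // This is the rejection a missing whitelist entry produces.
        throw new ValidationError(`Option '${name}' is not allowed to be changed`);
    }
    optionService.setOption(name, value);
}
```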
@@ -234,6 +234,10 @@ const defaultOptions: DefaultOption[] = [
        isSynced: true
    },

    // Search settings
    { name: "searchEnableFuzzyMatching", value: "true", isSynced: true },
    { name: "searchAutocompleteFuzzy", value: "false", isSynced: true },

    // Share settings
    { name: "redirectBareDomain", value: "false", isSynced: true },
    { name: "showLoginInShareTheme", value: "false", isSynced: true },
@@ -7,7 +7,7 @@ import Expression from "./expression.js";
import NoteSet from "../note_set.js";
import becca from "../../../becca/becca.js";
import { normalize } from "../../utils.js";
import { normalizeSearchText, fuzzyMatchWordWithResult } from "../utils/text_utils.js";
import beccaService from "../../../becca/becca_service.js";

class NoteFlatTextExp extends Expression {
@@ -23,6 +23,18 @@ class NoteFlatTextExp extends Expression {
    execute(inputNoteSet: NoteSet, executionContext: any, searchContext: SearchContext) {
        const resultNoteSet = new NoteSet();

        // Cache normalized titles to avoid redundant normalize+getNoteTitle calls
        const titleCache = new Map<string, string>();
        const getNormalizedTitle = (noteId: string, parentNoteId: string): string => {
            const key = `${noteId}-${parentNoteId}`;
            let cached = titleCache.get(key);
            if (cached === undefined) {
                cached = normalizeSearchText(beccaService.getNoteTitle(noteId, parentNoteId));
                titleCache.set(key, cached);
            }
            return cached;
        };

        /**
         * @param note
         * @param remainingTokens - tokens still needed to be found in the path towards root
@@ -38,10 +50,8 @@ class NoteFlatTextExp extends Expression {
                const noteId = resultPath[resultPath.length - 1];

                if (!resultNoteSet.hasNoteId(noteId)) {
                    // we could get here from multiple paths, the first one wins because the paths
                    // are sorted by importance
                    // Snapshot takenPath since it's mutable
                    executionContext.noteIdToNotePath[noteId] = resultPath;

                    resultNoteSet.add(becca.notes[noteId]);
                }
            }
@@ -50,45 +60,40 @@
            }

            if (note.parents.length === 0 || note.noteId === "root") {
                // we've reached root, but there are still remaining tokens -> this candidate note produced no result
                return;
            }

            const foundAttrTokens: string[] = [];

            for (const token of remainingTokens) {
                if ((note.type && note.type.includes(token)) ||
                    (note.mime && note.mime.includes(token))) {
                    foundAttrTokens.push(token);
                }
            }

            for (const attribute of note.getOwnedAttributes()) {
                for (const token of remainingTokens) {
                    if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) {
                        foundAttrTokens.push(token);
                    }
                }
            }

            for (const parentNote of note.parents) {
                const title = getNormalizedTitle(note.noteId, parentNote.noteId);

                // Use Set for O(1) lookup instead of Array.includes() which is O(n)
                const foundTokenSet = new Set<string>(foundAttrTokens);

                for (const token of remainingTokens) {
                    if (this.smartMatch(title, token, searchContext)) {
                        foundTokenSet.add(token);
                    }
                }

                if (foundTokenSet.size > 0) {
                    const newRemainingTokens = remainingTokens.filter((token) => !foundTokenSet.has(token));

                    searchPathTowardsRoot(parentNote, newRemainingTokens, [note.noteId, ...takenPath]);
                } else {
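To make the recursive walk concrete, a worked trace in comment form (the note titles are invented for illustration):

```ts
// Query tokens: ["trilium", "meeting"]; candidate leaf sits under:
//   root > "Projects" > "Trilium" > "Meeting notes"   (leaf)
//
// searchPathTowardsRoot("Meeting notes", ["trilium", "meeting"]):
//   - leaf title "meeting notes" matches "meeting"  → remaining: ["trilium"]
//   - recurse into parent "Trilium":
//       title "trilium" matches "trilium"           → remaining: []
//       no tokens left → record the path and add the note to resultNoteSet
//
// Had "trilium" never matched before reaching root, the candidate would be
// discarded (the early return at the top of this hunk).
```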
@@ -99,6 +104,22 @@ class NoteFlatTextExp extends Expression {

        const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext);

        // Fast path for single-token autocomplete searches:
        // Skip the expensive recursive parent walk and just use getBestNotePath().
        // The flat text already matched, so we know the token is present.
        if (this.tokens.length === 1 && searchContext.autocomplete) {
            for (const note of candidateNotes) {
                if (!resultNoteSet.hasNoteId(note.noteId)) {
                    const notePath = note.getBestNotePath();
                    if (notePath) {
                        executionContext.noteIdToNotePath[note.noteId] = notePath;
                        resultNoteSet.add(note);
                    }
                }
            }
            return resultNoteSet;
        }

        for (const note of candidateNotes) {
            // autocomplete should be able to find notes by their noteIds as well (only leafs)
            if (this.tokens.length === 1 && note.noteId.toLowerCase() === this.tokens[0]) {
@@ -112,13 +133,13 @@ class NoteFlatTextExp extends Expression {
                // Add defensive checks for undefined properties
                const typeMatches = note.type && note.type.includes(token);
                const mimeMatches = note.mime && note.mime.includes(token);

                if (typeMatches || mimeMatches) {
                    foundAttrTokens.push(token);
                }

                for (const attribute of note.ownedAttributes) {
                    if (attribute.normalizedName.includes(token) || attribute.normalizedValue.includes(token)) {
                        foundAttrTokens.push(token);
                    }
                }
@@ -165,10 +186,25 @@
    getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] {
        const candidateNotes: BNote[] = [];

        // Use the pre-built flat text index for fast scanning.
        // This provides pre-computed flat texts in a parallel array, avoiding
        // per-note property access overhead at large scale (50K+ notes).
        const { notes: indexNotes, flatTexts } = becca.getFlatTextIndex();

        // If noteSet covers every indexed note, per-note membership checks can be skipped entirely
        const isFullSet = noteSet.notes.length === indexNotes.length;

        for (let i = 0; i < indexNotes.length; i++) {
            const note = indexNotes[i];

            // Skip notes not in the input set (only check when not using the full set)
            if (!isFullSet && !noteSet.hasNoteId(note.noteId)) {
                continue;
            }

            const flatText = flatTexts[i];
            for (const token of this.tokens) {
                if (this.smartMatch(flatText, token, searchContext)) {
                    candidateNotes.push(note);
                    break;
                }
@@ -1,6 +1,7 @@
"use strict";

import hoistedNoteService from "../hoisted_note.js";
import optionService from "../options.js";
import type { SearchParams } from "./services/types.js";

class SearchContext {
@@ -18,6 +19,8 @@ class SearchContext {
    debug?: boolean;
    debugInfo: {} | null;
    fuzzyAttributeSearch: boolean;
    /** When true, skip the two-phase fuzzy fallback and use the single-token fast path. */
    autocomplete: boolean;
    enableFuzzyMatching: boolean; // Controls whether fuzzy matching is enabled for this search phase
    highlightedTokens: string[];
    originalQuery: string;
@@ -46,7 +49,12 @@ class SearchContext {
        this.debug = params.debug;
        this.debugInfo = null;
        this.fuzzyAttributeSearch = !!params.fuzzyAttributeSearch;
        this.autocomplete = !!params.autocomplete;
        try {
            this.enableFuzzyMatching = optionService.getOptionBool("searchEnableFuzzyMatching");
        } catch {
            this.enableFuzzyMatching = true; // Default to true if option not yet initialized
        }
        this.highlightedTokens = [];
        this.originalQuery = "";
        this.fulltextQuery = ""; // complete fulltext part

@@ -56,8 +56,9 @@ class SearchResult {
        this.fuzzyScore = 0; // Reset fuzzy score tracking

        const note = becca.notes[this.noteId];
        // normalizeSearchText already lowercases — no need for .toLowerCase() first
        const normalizedQuery = normalizeSearchText(fulltextQuery);
        const normalizedTitle = normalizeSearchText(note.title);

        // Note ID exact match, much higher score
        if (note.noteId.toLowerCase() === fulltextQuery) {
@@ -88,30 +89,32 @@
    }

    addScoreForStrings(tokens: string[], str: string, factor: number, enableFuzzyMatching: boolean = true) {
        // normalizeSearchText already lowercases — no need for .toLowerCase() first
        const normalizedStr = normalizeSearchText(str);
        const chunks = normalizedStr.split(" ");

        // Pre-normalize tokens once instead of per-chunk
        const normalizedTokens = tokens.map(t => normalizeSearchText(t));

        let tokenScore = 0;
        for (const chunk of chunks) {
            for (let ti = 0; ti < normalizedTokens.length; ti++) {
                const normalizedToken = normalizedTokens[ti];

                if (chunk === normalizedToken) {
                    tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * tokens[ti].length * factor;
                } else if (chunk.startsWith(normalizedToken)) {
                    tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * tokens[ti].length * factor;
                } else if (chunk.includes(normalizedToken)) {
                    tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * tokens[ti].length * factor;
                } else if (enableFuzzyMatching &&
                           normalizedToken.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
                           this.fuzzyScore < SCORE_WEIGHTS.MAX_TOTAL_FUZZY_SCORE) {
                    // Only compute edit distance when fuzzy matching is enabled
                    const editDistance = calculateOptimizedEditDistance(chunk, normalizedToken, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);

                    if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE) {
                        const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
                        // Apply caps: limit token length multiplier and per-token contribution
                        const cappedTokenLength = Math.min(tokens[ti].length, SCORE_WEIGHTS.MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);
                        const fuzzyTokenScore = Math.min(
                            fuzzyWeight * cappedTokenLength * factor,
                            SCORE_WEIGHTS.MAX_FUZZY_SCORE_PER_TOKEN
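A worked instance of the fuzzy branch, with invented constant values (the real ones live in `SCORE_WEIGHTS` / `FUZZY_SEARCH_CONFIG`):

```ts
// Assumed example constants — NOT the real configuration values.
const TOKEN_FUZZY_MATCH = 0.5;
const MAX_EDIT_DISTANCE = 2;
const MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER = 5;
const MAX_FUZZY_SCORE_PER_TOKEN = 2;

// chunk "performence" vs token "performance": one substitution, so edit distance 1
const editDistance = 1, tokenLength = "performance".length, factor = 1;

const fuzzyWeight = TOKEN_FUZZY_MATCH * (1 - editDistance / MAX_EDIT_DISTANCE);        // 0.25
const cappedTokenLength = Math.min(tokenLength, MAX_FUZZY_TOKEN_LENGTH_MULTIPLIER);    // 5, not 11
const fuzzyTokenScore = Math.min(fuzzyWeight * cappedTokenLength * factor,
                                 MAX_FUZZY_SCORE_PER_TOKEN);                           // 1.25

// The cap structure matters: long tokens can't dominate (length capped), and a
// single token can never contribute more than MAX_FUZZY_SCORE_PER_TOKEN.
```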
@@ -1,6 +1,5 @@
"use strict";

import lex from "./lex.js";
import handleParens from "./handle_parens.js";
import parse from "./parse.js";
@@ -8,7 +7,8 @@ import SearchResult from "../search_result.js";
import SearchContext from "../search_context.js";
import becca from "../../../becca/becca.js";
import beccaService from "../../../becca/becca_service.js";
import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js";
import { stripHtmlTags } from "../utils/text_utils.js";
import log from "../../log.js";
import hoistedNoteService from "../../hoisted_note.js";
import type BNote from "../../../becca/entities/bnote.js";
@@ -17,8 +17,8 @@ import type { SearchParams, TokenStructure } from "./types.js";
import type Expression from "../expressions/expression.js";
import sql from "../../sql.js";
import scriptService from "../../script.js";
import striptags from "striptags";
import protectedSessionService from "../../protected_session.js";
import optionService from "../../options.js";

export interface SearchNoteResult {
    searchResultNoteIds: string[];
@@ -252,21 +252,21 @@ function findResultsWithExpression(expression: Expression, searchContext: SearchContext): SearchResult[] {

    // Phase 1: Try exact matches first (without fuzzy matching)
    const exactResults = performSearch(expression, searchContext, false);

    // Check if we have sufficient high-quality results
    const minResultThreshold = 5;
    const minScoreForQuality = 10; // Minimum score to consider a result "high quality"

    const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality);

    // If we have enough high-quality exact matches, return them
    if (highQualityResults.length >= minResultThreshold) {
        return exactResults;
    }

    // Phase 2: Add fuzzy matching as fallback when exact matches are insufficient
    const fuzzyResults = performSearch(expression, searchContext, true);

    // Merge results, ensuring exact matches always rank higher than fuzzy matches
    return mergeExactAndFuzzyResults(exactResults, fuzzyResults);
}
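The early exit above is what the new options ultimately gate: fuzzy work is only paid for when exact search under-delivers. Distilled into a self-contained sketch — the names here are illustrative; only the thresholds and control flow mirror the function above:

```ts
// Generic exact-then-fuzzy fallback, mirroring findResultsWithExpression.
function twoPhase<T extends { score: number }>(
    search: (fuzzy: boolean) => T[],
    merge: (exact: T[], fuzzy: T[]) => T[],
    minResults = 5,
    minScore = 10
): T[] {
    const exact = search(false);
    // Enough high-quality exact hits? Skip the fuzzy pass entirely.
    if (exact.filter(r => r.score >= minScore).length >= minResults) {
        return exact;
    }
    // Otherwise pay for the fuzzy pass; merge must keep exact hits ranked on top.
    return merge(exact, search(true));
}
```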
@@ -410,6 +410,12 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear
    query = query || "";
    searchContext.originalQuery = query;

    // For autocomplete searches, use the dedicated autocomplete fuzzy option
    // instead of the global fuzzy setting. Do this early so it applies to all code paths.
    if (searchContext.autocomplete) {
        searchContext.enableFuzzyMatching = optionService.getOptionBool("searchAutocompleteFuzzy");
    }

    const expression = parseQueryToExpression(query, searchContext);

    if (!expression) {
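How a caller opts into this autocomplete path — usage mirroring the benchmark file later in this diff:

```ts
import SearchContext from "../search_context.js";
import searchService from "./search.js";

// Autocomplete-flavoured search: enables the single-token fast path in
// NoteFlatTextExp and swaps the global fuzzy setting for "searchAutocompleteFuzzy".
const ctx = new SearchContext({
    fastSearch: true,
    includeHiddenNotes: true,
    fuzzyAttributeSearch: true,
    ignoreInternalAttributes: true,
    autocomplete: true
});
const results = searchService.findResultsWithQuery("meeting", ctx);
```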
@@ -491,75 +497,63 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength

    // Strip HTML tags for text notes
    if (note.type === "text") {
        content = stripHtmlTags(content);
    }

    if (!content) {
        return "";
    }

    // Try to find a snippet around the first matching token.
    // Find match position using normalize on the raw stripped content.
    // We use a single normalize() pass — no need for expensive whitespace
    // normalization just to find the match index.
    const normalizedContent = normalize(content);
    const normalizedTokens = searchTokens.map(token => normalize(token));
    let snippetStart = 0;
    let matchFound = false;

    for (const normalizedToken of normalizedTokens) {
        const matchIndex = normalizedContent.indexOf(normalizedToken);

        if (matchIndex !== -1) {
            // Center the snippet around the match
            snippetStart = Math.max(0, matchIndex - maxLength / 2);
            matchFound = true;
            break;
        }
    }

    // Extract a snippet region from the raw content, then clean only that
    const snippetRegion = content.substring(snippetStart, snippetStart + maxLength + 100);

    // Normalize whitespace only on the small snippet region
    let snippet = snippetRegion
        .replace(/\n\s*\n/g, "\n\n")
        .replace(/[ \t]+/g, " ")
        .trim()
        .substring(0, maxLength);

    // If snippet contains linebreaks, limit to max 4 lines
    const lines = snippet.split('\n');
    if (lines.length > 4) {
        // Find the first line that contains a search token
        let firstMatchLine = -1;
        for (let i = 0; i < lines.length; i++) {
            const normalizedLine = normalize(lines[i]);
            if (normalizedTokens.some(token => normalizedLine.includes(token))) {
                firstMatchLine = i;
                break;
            }
        }

        if (firstMatchLine !== -1) {
            // Center the 4-line window around the first match
            // Try to show 1 line before and 2 lines after the match
            const startLine = Math.max(0, firstMatchLine - 1);
            const endLine = Math.min(lines.length, startLine + 4);
            snippet = lines.slice(startLine, endLine).join('\n');
        } else {
            // No match found in lines (shouldn't happen), just take first 4
            snippet = lines.slice(0, 4).join('\n');
        }
        // Add ellipsis if we truncated lines
        snippet = snippet + "...";
    } else if (lines.length <= 1) {
        // Single line content - apply word boundary logic
        if (snippetStart > 0) {
            const firstSpace = snippet.search(/\s/);
            if (firstSpace > 0 && firstSpace < 20) {
@@ -567,7 +561,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
            }
            snippet = "..." + snippet;
        }

        if (snippetStart + maxLength < content.length) {
            const lastSpace = snippet.search(/\s[^\s]*$/);
            if (lastSpace > snippet.length - 20 && lastSpace > 0) {
@@ -601,13 +595,14 @@ function extractAttributeSnippet(noteId: string, searchTokens: string[], maxLeng

    // Look for attributes that match the search tokens
    for (const attr of attributes) {
        // Use pre-normalized fields from BAttribute for diacritic-insensitive matching
        const attrName = attr.normalizedName || normalize(attr.name || "");
        const attrValue = attr.normalizedValue || normalize(attr.value || "");
        const attrType = attr.type || "";

        // Check if any search token matches the attribute name or value
        const hasMatch = searchTokens.some(token => {
            const normalizedToken = normalize(token);
            return attrName.includes(normalizedToken) || attrValue.includes(normalizedToken);
        });

@@ -675,7 +670,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
        includeHiddenNotes: true,
        fuzzyAttributeSearch: true,
        ignoreInternalAttributes: true,
        ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId(),
        autocomplete: true
    });

    const allSearchResults = findResultsWithQuery(query, searchContext);
@@ -752,37 +748,40 @@ function highlightSearchResults(searchResults: SearchResult[], highlightedTokens
    }

    for (const result of searchResults) {
        // Reset token
        const tokenRegex = new RegExp(escapeRegExp(token), "gi");
        let match;

        // Highlight in note path title
        if (result.highlightedNotePathTitle) {
            const titleRegex = new RegExp(escapeRegExp(token), "gi");
            // Compute diacritic-free version ONCE before the loop, not on every iteration
            let titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle);
            while ((match = titleRegex.exec(titleNoDiacritics)) !== null) {
                result.highlightedNotePathTitle = wrapText(result.highlightedNotePathTitle, match.index, token.length, "{", "}");
                // 2 characters are added, so we need to adjust the index and re-derive
                titleRegex.lastIndex += 2;
                titleNoDiacritics = removeDiacritic(result.highlightedNotePathTitle);
            }
        }

        // Highlight in content snippet
        if (result.highlightedContentSnippet) {
            const contentRegex = new RegExp(escapeRegExp(token), "gi");
            let contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet);
            while ((match = contentRegex.exec(contentNoDiacritics)) !== null) {
                result.highlightedContentSnippet = wrapText(result.highlightedContentSnippet, match.index, token.length, "{", "}");
                // 2 characters are added, so we need to adjust the index
                contentRegex.lastIndex += 2;
                contentNoDiacritics = removeDiacritic(result.highlightedContentSnippet);
            }
        }

        // Highlight in attribute snippet
        if (result.highlightedAttributeSnippet) {
            const attributeRegex = new RegExp(escapeRegExp(token), "gi");
            let attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet);
            while ((match = attributeRegex.exec(attrNoDiacritics)) !== null) {
                result.highlightedAttributeSnippet = wrapText(result.highlightedAttributeSnippet, match.index, token.length, "{", "}");
                // 2 characters are added, so we need to adjust the index
                attributeRegex.lastIndex += 2;
                attrNoDiacritics = removeDiacritic(result.highlightedAttributeSnippet);
            }
        }
    }
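The `+= 2` adjustment is the subtle part: each wrap inserts `{` and `}` (two characters), shifting everything after the match, so both the regex cursor and the diacritic-free shadow string must be refreshed. A worked trace with a hypothetical `wrapText` compatible with the calls above (the real helper is elsewhere in this file):

```ts
// Hypothetical helper matching the wrapText(text, index, length, open, close) calls above.
function wrapText(text: string, index: number, length: number, open: string, close: string): string {
    return text.slice(0, index) + open + text.slice(index, index + length) + close + text.slice(index + length);
}

let title = "café note café";
const regex = /cafe/gi;
let shadow = "cafe note cafe";                 // removeDiacritic(title)

let m = regex.exec(shadow)!;                   // match at index 0
title = wrapText(title, m.index, 4, "{", "}"); // "{café} note café"
regex.lastIndex += 2;                          // skip past the two inserted braces
shadow = "{cafe} note cafe";                   // re-derived from the wrapped title

m = regex.exec(shadow)!;                       // next match at index 12, correctly aligned
title = wrapText(title, m.index, 4, "{", "}"); // "{café} note {café}"
```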
@@ -0,0 +1,677 @@
/**
 * Comprehensive search benchmark suite.
 *
 * Covers many scenarios:
 * - Single-token, multi-token, phrase-like queries
 * - Fuzzy matching enabled vs disabled
 * - Autocomplete vs full search
 * - Diacritics / unicode queries
 * - No-match queries
 * - Varying note counts (1K, 5K, 10K, 20K)
 * - Warm cache vs cold cache
 *
 * All times are in-memory (monkeypatched getContent, no real SQL).
 */
import { describe, it, expect, afterEach } from "vitest";
import searchService from "./search.js";
import BNote from "../../../becca/entities/bnote.js";
import BBranch from "../../../becca/entities/bbranch.js";
import SearchContext from "../search_context.js";
import becca from "../../../becca/becca.js";
import { NoteBuilder, note } from "../../../test/becca_mocking.js";

// ── helpers ──────────────────────────────────────────────────────────

function randomWord(len = 6): string {
    const chars = "abcdefghijklmnopqrstuvwxyz";
    let word = "";
    for (let i = 0; i < len; i++) {
        word += chars[Math.floor(Math.random() * chars.length)];
    }
    return word;
}

function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string {
    const paragraphs: string[] = [];
    let wordsRemaining = wordCount;
    const kws = keywords ?? [];

    while (wordsRemaining > 0) {
        const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40));
        const words: string[] = [];
        for (let i = 0; i < paraWords; i++) {
            words.push(randomWord(3 + Math.floor(Math.random() * 10)));
        }
        if (includeKeywords && paragraphs.length === 2) {
            for (let k = 0; k < kws.length; k++) {
                const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1)));
                words[pos] = kws[k];
            }
        }
        paragraphs.push(`<p>${words.join(" ")}</p>`);
        wordsRemaining -= paraWords;
    }

    return `<html><body>${paragraphs.join("\n")}</body></html>`;
}

function timed<T>(fn: () => T): [T, number] {
    const start = performance.now();
    const result = fn();
    return [result, performance.now() - start];
}

function avg(nums: number[]): number {
    return nums.reduce((a, b) => a + b, 0) / nums.length;
}

function min(nums: number[]): number {
    return Math.min(...nums);
}

// ── dataset builder ──────────────────────────────────────────────────

const syntheticContent: Record<string, string> = {};

function buildDataset(noteCount: number, opts: {
    matchFraction?: number;
    labelsPerNote?: number;
    depth?: number;
    contentWordCount?: number;
    varyContentSize?: boolean;
    titleKeywords?: string[];
    contentKeywords?: string[];
    /** Include notes with diacritics in titles */
    includeDiacritics?: boolean;
} = {}) {
    const {
        matchFraction = 0.1,
        labelsPerNote = 3,
        depth = 4,
        contentWordCount = 300,
        varyContentSize = true,
        titleKeywords = ["target"],
        contentKeywords = titleKeywords,
        includeDiacritics = false,
    } = opts;

    becca.reset();
    for (const key of Object.keys(syntheticContent)) {
        delete syntheticContent[key];
    }

    const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" }));
    new BBranch({
        branchId: "none_root",
        noteId: "root",
        parentNoteId: "none",
        notePosition: 10
    });

    const containers: NoteBuilder[] = [];
    let parent = rootNote;
    for (let d = 0; d < depth; d++) {
        const container = note(`Container_${d}_${randomWord(4)}`);
        parent.child(container);
        containers.push(container);
        parent = container;
    }

    const matchCount = Math.floor(noteCount * matchFraction);
    const diacriticTitles = [
        "résumé", "naïve", "café", "über", "ñoño", "exposé",
        "Ångström", "Üntersuchung", "São Paulo", "François"
    ];

    for (let i = 0; i < noteCount; i++) {
        const isMatch = i < matchCount;
        let title: string;

        if (includeDiacritics && i % 20 === 0) {
            // Every 20th note gets a diacritics-heavy title
            const dTitle = diacriticTitles[i % diacriticTitles.length];
            title = isMatch
                ? `${dTitle} ${titleKeywords.join(" ")} Document ${i}`
                : `${dTitle} ${randomWord(5)} Note ${i}`;
        } else {
            title = isMatch
                ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}`
                : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`;
        }

        const n = note(title);

        for (let l = 0; l < labelsPerNote; l++) {
            const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`;
            const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8);
            n.label(labelName, labelValue);
        }

        let noteWordCount = contentWordCount;
        if (varyContentSize) {
            const r = Math.random();
            if (r < 0.2) noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3));
            else if (r < 0.7) noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6));
            else if (r < 0.9) noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7));
            else noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0));
        }

        const includeContentKeyword = isMatch && contentKeywords.length > 0;
        syntheticContent[n.note.noteId] = generateHtmlContent(
            noteWordCount,
            includeContentKeyword,
            includeContentKeyword ? contentKeywords : undefined
        );

        const containerIndex = i % containers.length;
        containers[containerIndex].child(n);
    }

    // Monkeypatch getContent()
    for (const noteObj of Object.values(becca.notes)) {
        const noteId = noteObj.noteId;
        if (syntheticContent[noteId]) {
            (noteObj as any).getContent = () => syntheticContent[noteId];
        } else {
            (noteObj as any).getContent = () => "";
        }
    }

    return { rootNote, matchCount };
}

// ── benchmark runner ─────────────────────────────────────────────────

interface BenchmarkResult {
    query: string;
    mode: string;
    noteCount: number;
    avgMs: number;
    minMs: number;
    resultCount: number;
}

function runBenchmark(
    query: string,
    mode: "autocomplete" | "fullSearch",
    fuzzyEnabled: boolean,
    iterations = 5
): BenchmarkResult {
    const noteCount = Object.keys(becca.notes).length;

    // Warm up
    if (mode === "autocomplete") {
        searchService.searchNotesForAutocomplete(query, true);
    } else {
        const ctx = new SearchContext({ fastSearch: false });
        ctx.enableFuzzyMatching = fuzzyEnabled;
        searchService.findResultsWithQuery(query, ctx);
    }

    const times: number[] = [];
    let resultCount = 0;

    for (let i = 0; i < iterations; i++) {
        if (mode === "autocomplete") {
            // For autocomplete, fuzzy is controlled by the global option.
            // We'll manipulate enableFuzzyMatching after construction.
            const [results, ms] = timed(() => {
                // searchNotesForAutocomplete creates its own SearchContext internally,
                // so we need to test via findResultsWithQuery for fuzzy control.
                const ctx = new SearchContext({
                    fastSearch: true,
                    includeHiddenNotes: true,
                    fuzzyAttributeSearch: true,
                    ignoreInternalAttributes: true,
                    autocomplete: true
                });
                ctx.enableFuzzyMatching = fuzzyEnabled;
                return searchService.findResultsWithQuery(query, ctx);
            });
            times.push(ms);
            resultCount = results.length;
        } else {
            const [results, ms] = timed(() => {
                const ctx = new SearchContext({ fastSearch: false });
                ctx.enableFuzzyMatching = fuzzyEnabled;
                return searchService.findResultsWithQuery(query, ctx);
            });
            times.push(ms);
            resultCount = results.length;
        }
    }

    return {
        query,
        mode: `${mode}${fuzzyEnabled ? "+fuzzy" : ""}`,
        noteCount,
        avgMs: avg(times),
        minMs: min(times),
        resultCount
    };
}

function printTable(title: string, results: BenchmarkResult[]) {
    console.log(`\n${"═".repeat(110)}`);
    console.log(` ${title}`);
    console.log(`${"═".repeat(110)}`);
    console.log(
        " " +
        "Query".padEnd(35) +
        "Mode".padEnd(22) +
        "Notes".padStart(7) +
        "Avg (ms)".padStart(12) +
        "Min (ms)".padStart(12) +
        "Results".padStart(10)
    );
    console.log(` ${"─".repeat(98)}`);
    for (const r of results) {
        console.log(
            " " +
            `"${r.query}"`.padEnd(35) +
            r.mode.padEnd(22) +
            String(r.noteCount).padStart(7) +
            r.avgMs.toFixed(1).padStart(12) +
            r.minMs.toFixed(1).padStart(12) +
            String(r.resultCount).padStart(10)
        );
    }
    console.log(`${"═".repeat(110)}\n`);
}

// ── tests ────────────────────────────────────────────────────────────

// Skipped by default - this is a benchmark, not a test.
// Remove .skip to run manually for performance analysis.
describe.skip("Comprehensive Search Benchmark", () => {

    afterEach(() => {
        becca.reset();
    });

    describe("Single-token queries", () => {
        for (const noteCount of [1000, 5000, 10000, 20000]) {
            it(`single token @ ${noteCount} notes — fuzzy on vs off, autocomplete vs full`, () => {
                buildDataset(noteCount, {
                    matchFraction: 0.15,
                    titleKeywords: ["meeting"],
                    contentKeywords: ["meeting"],
                    contentWordCount: 300,
                });

                const results: BenchmarkResult[] = [
                    runBenchmark("meeting", "autocomplete", false),
                    runBenchmark("meeting", "autocomplete", true),
                    runBenchmark("meeting", "fullSearch", false),
                    runBenchmark("meeting", "fullSearch", true),
                ];

                printTable(`Single Token "meeting" — ${noteCount} notes`, results);
                expect(results[0].resultCount).toBeGreaterThan(0);
            });
        }
    });

    describe("Multi-token queries", () => {
        for (const noteCount of [1000, 5000, 10000, 20000]) {
            it(`multi token @ ${noteCount} notes — fuzzy on vs off`, () => {
                buildDataset(noteCount, {
                    matchFraction: 0.15,
                    titleKeywords: ["meeting", "notes", "january"],
                    contentKeywords: ["meeting", "notes", "january"],
                    contentWordCount: 400,
                });

                const results: BenchmarkResult[] = [
                    // 2-token
                    runBenchmark("meeting notes", "autocomplete", false),
                    runBenchmark("meeting notes", "autocomplete", true),
                    runBenchmark("meeting notes", "fullSearch", false),
                    runBenchmark("meeting notes", "fullSearch", true),
                    // 3-token
                    runBenchmark("meeting notes january", "autocomplete", false),
                    runBenchmark("meeting notes january", "autocomplete", true),
                    runBenchmark("meeting notes january", "fullSearch", false),
                    runBenchmark("meeting notes january", "fullSearch", true),
                ];

                printTable(`Multi Token — ${noteCount} notes`, results);
                expect(results[0].resultCount).toBeGreaterThan(0);
            });
        }
    });

    describe("No-match queries (worst case — full scan, zero results)", () => {
        for (const noteCount of [1000, 5000, 10000, 20000]) {
            it(`no-match @ ${noteCount} notes`, () => {
                buildDataset(noteCount, {
                    matchFraction: 0.1,
                    titleKeywords: ["target"],
                    contentKeywords: ["target"],
                    contentWordCount: 300,
                });

                const results: BenchmarkResult[] = [
                    runBenchmark("xyznonexistent", "autocomplete", false),
                    runBenchmark("xyznonexistent", "autocomplete", true),
                    runBenchmark("xyznonexistent", "fullSearch", false),
                    runBenchmark("xyznonexistent", "fullSearch", true),
                    runBenchmark("xyzfoo xyzbar", "autocomplete", false),
                    runBenchmark("xyzfoo xyzbar", "autocomplete", true),
                    runBenchmark("xyzfoo xyzbar", "fullSearch", false),
                    runBenchmark("xyzfoo xyzbar", "fullSearch", true),
                ];

                printTable(`No-Match Queries — ${noteCount} notes`, results);
                // All should return 0 results
                for (const r of results) {
                    expect(r.resultCount).toBe(0);
                }
            });
        }
    });

    describe("Diacritics / Unicode queries", () => {
        for (const noteCount of [1000, 5000, 10000]) {
            it(`diacritics @ ${noteCount} notes`, () => {
                buildDataset(noteCount, {
                    matchFraction: 0.15,
                    titleKeywords: ["résumé"],
                    contentKeywords: ["résumé"],
                    contentWordCount: 300,
                    includeDiacritics: true,
                });

                const results: BenchmarkResult[] = [
                    // Exact diacritics
                    runBenchmark("résumé", "autocomplete", false),
                    runBenchmark("résumé", "autocomplete", true),
                    // ASCII equivalent (should still match via normalize)
                    runBenchmark("resume", "autocomplete", false),
                    runBenchmark("resume", "autocomplete", true),
                    // Full search
                    runBenchmark("résumé", "fullSearch", false),
                    runBenchmark("resume", "fullSearch", false),
                ];

                printTable(`Diacritics "résumé" / "resume" — ${noteCount} notes`, results);
            });
        }
    });

    describe("Partial / prefix queries (simulating typing)", () => {
        for (const noteCount of [5000, 10000, 20000]) {
            it(`typing progression @ ${noteCount} notes`, () => {
                buildDataset(noteCount, {
                    matchFraction: 0.15,
                    titleKeywords: ["documentation"],
                    contentKeywords: ["documentation"],
                    contentWordCount: 300,
                });

                const results: BenchmarkResult[] = [
                    runBenchmark("d", "autocomplete", false),
                    runBenchmark("do", "autocomplete", false),
                    runBenchmark("doc", "autocomplete", false),
                    runBenchmark("docu", "autocomplete", false),
                    runBenchmark("docum", "autocomplete", false),
                    runBenchmark("document", "autocomplete", false),
                    runBenchmark("documentation", "autocomplete", false),
                    // Same with fuzzy
                    runBenchmark("d", "autocomplete", true),
                    runBenchmark("doc", "autocomplete", true),
                    runBenchmark("document", "autocomplete", true),
                    runBenchmark("documentation", "autocomplete", true),
                ];

                printTable(`Typing Progression "documentation" — ${noteCount} notes`, results);
            });
        }
    });

describe("Attribute-matching queries", () => {
|
||||
for (const noteCount of [5000, 10000]) {
|
||||
it(`attribute search @ ${noteCount} notes`, () => {
|
||||
buildDataset(noteCount, {
|
||||
matchFraction: 0.15,
|
||||
labelsPerNote: 5,
|
||||
titleKeywords: ["important"],
|
||||
contentKeywords: ["important"],
|
||||
contentWordCount: 200,
|
||||
});
|
||||
|
||||
const results: BenchmarkResult[] = [
|
||||
// "category" is a label name on matching notes
|
||||
runBenchmark("category", "autocomplete", false),
|
||||
runBenchmark("category", "autocomplete", true),
|
||||
runBenchmark("category", "fullSearch", false),
|
||||
runBenchmark("category", "fullSearch", true),
|
||||
// "important" appears in both title and label value
|
||||
runBenchmark("important", "autocomplete", false),
|
||||
runBenchmark("important", "autocomplete", true),
|
||||
];
|
||||
|
||||
printTable(`Attribute Matching — ${noteCount} notes`, results);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
describe("Long queries (4-5 tokens)", () => {
|
||||
for (const noteCount of [5000, 10000]) {
|
||||
it(`long query @ ${noteCount} notes`, () => {
|
||||
buildDataset(noteCount, {
|
||||
matchFraction: 0.10,
|
||||
titleKeywords: ["quarterly", "budget", "review", "report"],
|
||||
contentKeywords: ["quarterly", "budget", "review", "report"],
|
||||
contentWordCount: 500,
|
||||
});
|
||||
|
||||
const results: BenchmarkResult[] = [
|
||||
runBenchmark("quarterly", "autocomplete", false),
|
||||
runBenchmark("quarterly budget", "autocomplete", false),
|
||||
runBenchmark("quarterly budget review", "autocomplete", false),
|
||||
runBenchmark("quarterly budget review report", "autocomplete", false),
|
||||
// Same with fuzzy
|
||||
runBenchmark("quarterly budget review report", "autocomplete", true),
|
||||
// Full search
|
||||
runBenchmark("quarterly budget review report", "fullSearch", false),
|
||||
runBenchmark("quarterly budget review report", "fullSearch", true),
|
||||
];
|
||||
|
||||
printTable(`Long Queries (4 tokens) — ${noteCount} notes`, results);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
describe("Mixed scenario — realistic user session", () => {
|
||||
it("simulates a user session with varied queries @ 10K notes", () => {
|
||||
buildDataset(10000, {
|
||||
matchFraction: 0.15,
|
||||
titleKeywords: ["project", "planning"],
|
||||
contentKeywords: ["project", "planning", "timeline", "budget"],
|
||||
contentWordCount: 400,
|
||||
varyContentSize: true,
|
||||
includeDiacritics: true,
|
||||
depth: 6,
|
||||
});
|
||||
|
||||
const results: BenchmarkResult[] = [
|
||||
// Quick autocomplete lookups (user typing in search bar)
|
||||
runBenchmark("pro", "autocomplete", false),
|
||||
runBenchmark("project", "autocomplete", false),
|
||||
runBenchmark("project plan", "autocomplete", false),
|
||||
|
||||
// Full search (user hits Enter)
|
||||
runBenchmark("project", "fullSearch", false),
|
||||
runBenchmark("project planning", "fullSearch", false),
|
||||
runBenchmark("project planning", "fullSearch", true),
|
||||
|
||||
// Typo / near-miss with fuzzy
|
||||
runBenchmark("projct", "autocomplete", false),
|
||||
runBenchmark("projct", "autocomplete", true),
|
||||
runBenchmark("projct planing", "fullSearch", false),
|
||||
runBenchmark("projct planing", "fullSearch", true),
|
||||
|
||||
// No results
|
||||
runBenchmark("xyznonexistent", "autocomplete", false),
|
||||
runBenchmark("xyznonexistent foo", "fullSearch", true),
|
||||
|
||||
// Short common substring
|
||||
runBenchmark("note", "autocomplete", false),
|
||||
runBenchmark("document", "autocomplete", false),
|
||||
];
|
||||
|
||||
printTable("Realistic User Session — 10K notes", results);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Cache warmth impact", () => {
|
||||
it("cold vs warm flat text index @ 10K notes", () => {
|
||||
buildDataset(10000, {
|
||||
matchFraction: 0.15,
|
||||
titleKeywords: ["target"],
|
||||
contentKeywords: ["target"],
|
||||
contentWordCount: 300,
|
||||
});
|
||||
|
||||
console.log(`\n${"═".repeat(80)}`);
|
||||
console.log(" Cold vs Warm Cache — 10K notes");
|
||||
console.log(`${"═".repeat(80)}`);
|
||||
|
||||
// Cold: first search after dataset build (flat text index not yet built)
|
||||
becca.flatTextIndex = null;
|
||||
becca.dirtyFlatTextNoteIds.clear();
|
||||
const [coldResults, coldMs] = timed(() => {
|
||||
const ctx = new SearchContext({ fastSearch: true, autocomplete: true });
|
||||
ctx.enableFuzzyMatching = false;
|
||||
return searchService.findResultsWithQuery("target", ctx);
|
||||
});
|
||||
console.log(` Cold (index build + search): ${coldMs.toFixed(1)}ms (${coldResults.length} results)`);
|
||||
|
||||
// Warm: subsequent searches reuse the index
|
||||
const warmTimes: number[] = [];
|
||||
for (let i = 0; i < 5; i++) {
|
||||
const [, ms] = timed(() => {
|
||||
const ctx = new SearchContext({ fastSearch: true, autocomplete: true });
|
||||
ctx.enableFuzzyMatching = false;
|
||||
return searchService.findResultsWithQuery("target", ctx);
|
||||
});
|
||||
warmTimes.push(ms);
|
||||
}
|
||||
console.log(` Warm (reuse index, 5 runs): avg ${avg(warmTimes).toFixed(1)}ms min ${min(warmTimes).toFixed(1)}ms`);
|
||||
|
||||
// Incremental: dirty a few notes and search again
|
||||
const noteIds = Object.keys(becca.notes).slice(0, 50);
|
||||
for (const nid of noteIds) {
|
||||
becca.dirtyNoteFlatText(nid);
|
||||
}
|
||||
const [, incrMs] = timed(() => {
|
||||
const ctx = new SearchContext({ fastSearch: true, autocomplete: true });
|
||||
ctx.enableFuzzyMatching = false;
|
||||
return searchService.findResultsWithQuery("target", ctx);
|
||||
});
|
||||
console.log(` Incremental (50 dirty notes): ${incrMs.toFixed(1)}ms`);
|
||||
|
||||
// Full rebuild
|
||||
becca.flatTextIndex = null;
|
||||
const [, rebuildMs] = timed(() => {
|
||||
const ctx = new SearchContext({ fastSearch: true, autocomplete: true });
|
||||
ctx.enableFuzzyMatching = false;
|
||||
return searchService.findResultsWithQuery("target", ctx);
|
||||
});
|
||||
console.log(` Full rebuild (index = null): ${rebuildMs.toFixed(1)}ms`);
|
||||
|
||||
console.log(`${"═".repeat(80)}\n`);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Fuzzy matching effectiveness comparison", () => {
|
||||
it("exact vs fuzzy result quality @ 10K notes", () => {
|
||||
buildDataset(10000, {
|
||||
matchFraction: 0.10,
|
||||
titleKeywords: ["performance"],
|
||||
contentKeywords: ["performance", "optimization"],
|
||||
contentWordCount: 300,
|
||||
});
|
||||
|
||||
console.log(`\n${"═".repeat(90)}`);
|
||||
console.log(" Fuzzy Matching Effectiveness — 10K notes");
|
||||
console.log(`${"═".repeat(90)}`);
|
||||
console.log(
|
||||
" " +
|
||||
"Query".padEnd(30) +
|
||||
"Fuzzy".padEnd(8) +
|
||||
"Time (ms)".padStart(12) +
|
||||
"Results".padStart(10) +
|
||||
" Notes"
|
||||
);
|
||||
console.log(` ${"─".repeat(70)}`);
|
||||
|
||||
const queries = [
|
||||
"performance", // exact match
|
||||
"performanc", // truncated
|
||||
"preformance", // typo
|
||||
"performence", // common misspelling
|
||||
"optimization", // exact match
|
||||
"optimzation", // typo
|
||||
"perf optim", // abbreviated multi
|
||||
];
|
||||
|
||||
for (const query of queries) {
|
||||
for (const fuzzy of [false, true]) {
|
||||
const times: number[] = [];
|
||||
let resultCount = 0;
|
||||
for (let i = 0; i < 3; i++) {
|
||||
const [results, ms] = timed(() => {
|
||||
const ctx = new SearchContext({ fastSearch: true });
|
||||
ctx.enableFuzzyMatching = fuzzy;
|
||||
return searchService.findResultsWithQuery(query, ctx);
|
||||
});
|
||||
times.push(ms);
|
||||
resultCount = results.length;
|
||||
}
|
||||
console.log(
|
||||
" " +
|
||||
`"${query}"`.padEnd(30) +
|
||||
(fuzzy ? "ON" : "OFF").padEnd(8) +
|
||||
avg(times).toFixed(1).padStart(12) +
|
||||
String(resultCount).padStart(10)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`${"═".repeat(90)}\n`);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Scale comparison summary", () => {
|
||||
it("summary table across all note counts", () => {
|
||||
const summaryResults: BenchmarkResult[] = [];
|
||||
|
||||
for (const noteCount of [1000, 5000, 10000, 20000]) {
|
||||
buildDataset(noteCount, {
|
||||
matchFraction: 0.15,
|
||||
titleKeywords: ["meeting", "notes"],
|
||||
contentKeywords: ["meeting", "notes"],
|
||||
contentWordCount: 400,
|
||||
varyContentSize: true,
|
||||
depth: 5,
|
||||
});
|
||||
|
||||
// Core scenarios
|
||||
summaryResults.push(runBenchmark("meeting", "autocomplete", false));
|
||||
summaryResults.push(runBenchmark("meeting", "autocomplete", true));
|
||||
summaryResults.push(runBenchmark("meeting notes", "autocomplete", false));
|
||||
summaryResults.push(runBenchmark("meeting notes", "autocomplete", true));
|
||||
summaryResults.push(runBenchmark("meeting", "fullSearch", false));
|
||||
summaryResults.push(runBenchmark("meeting", "fullSearch", true));
|
||||
summaryResults.push(runBenchmark("meeting notes", "fullSearch", false));
|
||||
summaryResults.push(runBenchmark("meeting notes", "fullSearch", true));
|
||||
summaryResults.push(runBenchmark("xyznonexistent", "autocomplete", false));
|
||||
summaryResults.push(runBenchmark("xyznonexistent", "fullSearch", true));
|
||||
}
|
||||
|
||||
printTable("Scale Comparison Summary", summaryResults);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,665 @@
/**
 * Search performance profiling tests.
 *
 * These tests measure where time is spent in the search pipeline.
 * We monkeypatch note.getContent() to return synthetic HTML content
 * since unit tests don't have a real SQLite database.
 *
 * KNOWN GAPS vs production:
 * - note.getContent() is instant (monkeypatched) vs ~2ms SQL fetch
 * - NoteContentFulltextExp.execute() is skipped (no sql.iterateRows)
 *   because fastSearch=true uses only NoteFlatTextExp
 * - These tests focus on the in-memory/CPU-bound parts of the pipeline
 */
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import searchService from "./search.js";
import BNote from "../../../becca/entities/bnote.js";
import BBranch from "../../../becca/entities/bbranch.js";
import SearchContext from "../search_context.js";
import becca from "../../../becca/becca.js";
import beccaService from "../../../becca/becca_service.js";
import { NoteBuilder, note, id } from "../../../test/becca_mocking.js";
import SearchResult from "../search_result.js";
import { normalizeSearchText } from "../utils/text_utils.js";

// ── helpers ──────────────────────────────────────────────────────────

function randomWord(len = 6): string {
    const chars = "abcdefghijklmnopqrstuvwxyz";
    let word = "";
    for (let i = 0; i < len; i++) {
        word += chars[Math.floor(Math.random() * chars.length)];
    }
    return word;
}

function generateHtmlContent(wordCount: number, includeKeywords = false, keywords?: string[]): string {
    const paragraphs: string[] = [];
    let wordsRemaining = wordCount;
    const kws = keywords ?? ["target"];

    while (wordsRemaining > 0) {
        const paraWords = Math.min(wordsRemaining, 20 + Math.floor(Math.random() * 40));
        const words: string[] = [];
        for (let i = 0; i < paraWords; i++) {
            words.push(randomWord(3 + Math.floor(Math.random() * 10)));
        }
        if (includeKeywords && paragraphs.length === 2) {
            // Inject all keywords into the paragraph at spaced positions
            for (let k = 0; k < kws.length; k++) {
                const pos = Math.min(words.length - 1, Math.floor((words.length / (kws.length + 1)) * (k + 1)));
                words[pos] = kws[k];
            }
        }
        paragraphs.push(`<p>${words.join(" ")}</p>`);
        wordsRemaining -= paraWords;
    }

    return `<html><body>${paragraphs.join("\n")}</body></html>`;
}

function timed<T>(fn: () => T): [T, number] {
    const start = performance.now();
    const result = fn();
    return [result, performance.now() - start];
}

interface TimingEntry { label: string; ms: number; }

function reportTimings(title: string, timings: TimingEntry[]) {
    const total = timings.reduce((s, t) => s + t.ms, 0);
    console.log(`\n=== ${title} (total: ${total.toFixed(1)}ms) ===`);
    for (const { label, ms } of timings) {
        const pct = total > 0 ? ((ms / total) * 100).toFixed(0) : "0";
const bar = "#".repeat(Math.max(1, Math.round(ms / total * 40)));
|
||||
        console.log(` ${label.padEnd(55)} ${ms.toFixed(1).padStart(8)}ms ${pct.padStart(3)}% ${bar}`);
    }
}
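
// The two helpers above compose into the measurement pattern used throughout
// this file; a minimal usage sketch (stage name and workload are illustrative):
//
//   const timings: TimingEntry[] = [];
//   const [rows, ms] = timed(() => expensiveStage());
//   timings.push({ label: "expensiveStage", ms });
//   reportTimings("My scenario", timings);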

// ── dataset builder ──────────────────────────────────────────────────

const syntheticContent: Record<string, string> = {};

function buildDataset(noteCount: number, opts: {
    matchFraction?: number;
    labelsPerNote?: number;
    depth?: number;
    contentWordCount?: number;
    /** When set, contentWordCount is treated as a median and actual sizes vary from 0.2x to 3x */
    varyContentSize?: boolean;
    /** Keywords to inject into matching notes' titles (default: ["target"]) */
    titleKeywords?: string[];
    /** Keywords to inject into matching notes' content (default: same as titleKeywords) */
    contentKeywords?: string[];
} = {}) {
    const {
        matchFraction = 0.1,
        labelsPerNote = 3,
        depth = 3,
        contentWordCount = 200,
        varyContentSize = false,
        titleKeywords = ["target"],
        contentKeywords = titleKeywords,
    } = opts;

    becca.reset();
    for (const key of Object.keys(syntheticContent)) {
        delete syntheticContent[key];
    }

    const rootNote = new NoteBuilder(new BNote({ noteId: "root", title: "root", type: "text" }));
    new BBranch({
        branchId: "none_root",
        noteId: "root",
        parentNoteId: "none",
        notePosition: 10
    });

    const containers: NoteBuilder[] = [];
    let parent = rootNote;
    for (let d = 0; d < depth; d++) {
        const container = note(`Container_${d}_${randomWord(4)}`);
        parent.child(container);
        containers.push(container);
        parent = container;
    }

    const matchCount = Math.floor(noteCount * matchFraction);

    for (let i = 0; i < noteCount; i++) {
        const isMatch = i < matchCount;
        const title = isMatch
            ? `${randomWord(5)} ${titleKeywords.join(" ")} ${randomWord(5)} Document ${i}`
            : `${randomWord(5)} ${randomWord(6)} ${randomWord(4)} Note ${i}`;

        const n = note(title);

        for (let l = 0; l < labelsPerNote; l++) {
            const labelName = isMatch && l === 0 ? "category" : `label_${randomWord(4)}`;
            const labelValue = isMatch && l === 0 ? `important ${titleKeywords[0]}` : randomWord(8);
            n.label(labelName, labelValue);
        }

        // Vary content size: 0.2x to 3x the median, producing a realistic
        // mix of short stubs, medium notes, and long documents.
        let noteWordCount = contentWordCount;
        if (varyContentSize) {
            const r = Math.random();
            if (r < 0.2) {
                noteWordCount = Math.floor(contentWordCount * (0.2 + Math.random() * 0.3)); // 20-50% (short stubs)
            } else if (r < 0.7) {
                noteWordCount = Math.floor(contentWordCount * (0.7 + Math.random() * 0.6)); // 70-130% (medium)
            } else if (r < 0.9) {
                noteWordCount = Math.floor(contentWordCount * (1.3 + Math.random() * 0.7)); // 130-200% (long)
            } else {
                noteWordCount = Math.floor(contentWordCount * (2.0 + Math.random() * 1.0)); // 200-300% (very long)
            }
        }

        const includeContentKeyword = isMatch && contentKeywords.length > 0;
        syntheticContent[n.note.noteId] = generateHtmlContent(
            noteWordCount,
            includeContentKeyword,
            includeContentKeyword ? contentKeywords : undefined
        );

        const containerIndex = i % containers.length;
        containers[containerIndex].child(n);
    }

    // Monkeypatch getContent()
    for (const noteObj of Object.values(becca.notes)) {
        const noteId = noteObj.noteId;
        if (syntheticContent[noteId]) {
            (noteObj as any).getContent = () => syntheticContent[noteId];
        } else {
            (noteObj as any).getContent = () => "";
        }
    }

    return { rootNote, matchCount };
}

// ── profiling tests ──────────────────────────────────────────────────

describe("Search Profiling", () => {

    afterEach(() => {
        becca.reset();
    });

    /**
     * Break down the autocomplete pipeline into every individual stage,
     * including previously unmeasured operations like getBestNotePath,
     * SearchResult construction, and getNoteTitleForPath.
     */
    describe("Granular autocomplete pipeline", () => {

        for (const noteCount of [500, 2000, 5000, 10000]) {
            it(`granular breakdown with ${noteCount} notes`, () => {
                const timings: TimingEntry[] = [];

                const [, buildMs] = timed(() => buildDataset(noteCount, {
                    matchFraction: 0.2,
                    contentWordCount: 300,
                    depth: 5
                }));
                timings.push({ label: `Dataset build (${noteCount} notes)`, ms: buildMs });

                // === NoteFlatTextExp: getCandidateNotes ===
                // This calls getFlatText() + normalizeSearchText() for EVERY note
                const allNotes = Object.values(becca.notes);
                for (const n of allNotes) n.invalidateThisCache();

                const [, candidateMs] = timed(() => {
                    const token = normalizeSearchText("target");
                    let count = 0;
                    for (const n of allNotes) {
                        const flatText = normalizeSearchText(n.getFlatText());
                        if (flatText.includes(token)) count++;
                    }
                    return count;
                });
                timings.push({ label: `getCandidateNotes simulation (cold caches)`, ms: candidateMs });

                // Warm cache version
                const [candidateCount, candidateWarmMs] = timed(() => {
                    const token = normalizeSearchText("target");
                    let count = 0;
                    for (const n of allNotes) {
                        const flatText = normalizeSearchText(n.getFlatText());
                        if (flatText.includes(token)) count++;
                    }
                    return count;
                });
                timings.push({ label: `getCandidateNotes simulation (warm caches)`, ms: candidateWarmMs });

                // === getBestNotePath for each candidate ===
                const candidates = allNotes.filter(n => {
                    const flatText = normalizeSearchText(n.getFlatText());
                    return flatText.includes("target");
                });

                const [, pathMs] = timed(() => {
                    for (const n of candidates) {
                        n.getBestNotePath();
                    }
                });
                timings.push({ label: `getBestNotePath (${candidates.length} notes)`, ms: pathMs });

                // === SearchResult construction (includes getNoteTitleForPath) ===
                const paths = candidates.map(n => n.getBestNotePath()).filter(Boolean);

                const [searchResults, srMs] = timed(() => {
                    return paths.map(p => new SearchResult(p));
                });
                timings.push({ label: `SearchResult construction (${paths.length} results)`, ms: srMs });

                // === computeScore ===
                const [, scoreMs] = timed(() => {
                    for (const r of searchResults) {
                        r.computeScore("target", ["target"], true);
                    }
                });
                timings.push({ label: `computeScore with fuzzy (${searchResults.length} results)`, ms: scoreMs });

                const [, scoreNoFuzzyMs] = timed(() => {
                    for (const r of searchResults) {
                        r.computeScore("target", ["target"], false);
                    }
                });
                timings.push({ label: `computeScore no-fuzzy`, ms: scoreNoFuzzyMs });

                // === Sorting ===
                const [, sortMs] = timed(() => {
                    searchResults.sort((a, b) => {
                        if (a.score !== b.score) return b.score - a.score;
                        if (a.notePathArray.length === b.notePathArray.length) {
                            return a.notePathTitle < b.notePathTitle ? -1 : 1;
                        }
                        return a.notePathArray.length - b.notePathArray.length;
                    });
                });
                timings.push({ label: `Sort results`, ms: sortMs });

                // === Trim + content snippet extraction ===
                const trimmed = searchResults.slice(0, 200);

                const [, snippetMs] = timed(() => {
                    for (const r of trimmed) {
                        r.contentSnippet = searchService.extractContentSnippet(
                            r.noteId, ["target"]
                        );
                    }
                });
                timings.push({ label: `Content snippet extraction (${trimmed.length} results)`, ms: snippetMs });

                const [, attrMs] = timed(() => {
                    for (const r of trimmed) {
                        r.attributeSnippet = searchService.extractAttributeSnippet(
                            r.noteId, ["target"]
                        );
                    }
                });
                timings.push({ label: `Attribute snippet extraction`, ms: attrMs });

                // === Highlighting ===
                const [, hlMs] = timed(() => {
                    searchService.highlightSearchResults(trimmed, ["target"]);
                });
                timings.push({ label: `Highlighting`, ms: hlMs });

                // === Final mapping (getNoteTitleAndIcon) ===
                const [, mapMs] = timed(() => {
                    for (const r of trimmed) {
                        beccaService.getNoteTitleAndIcon(r.noteId);
                    }
                });
                timings.push({ label: `getNoteTitleAndIcon (${trimmed.length} results)`, ms: mapMs });

                // === Full autocomplete for comparison ===
                const [autoResults, autoMs] = timed(() => {
                    return searchService.searchNotesForAutocomplete("target", true);
                });
                timings.push({ label: `Full autocomplete call (end-to-end)`, ms: autoMs });

                reportTimings(`Granular Autocomplete — ${noteCount} notes`, timings);
                expect(autoResults.length).toBeGreaterThan(0);
            });
        }
    });

    /**
     * Test the specific cost of normalizeSearchText which is called
     * pervasively throughout the pipeline.
     */
    describe("normalizeSearchText cost", () => {

        it("profile normalizeSearchText at scale", () => {
            buildDataset(5000, { matchFraction: 0.2, contentWordCount: 100 });

            // Generate various text lengths to profile
            const shortTexts = Array.from({ length: 5000 }, () => randomWord(10));
            const mediumTexts = Array.from({ length: 5000 }, () =>
                Array.from({ length: 20 }, () => randomWord(6)).join(" ")
            );
            const longTexts = Object.values(becca.notes).map(n => n.getFlatText());

            console.log("\n=== normalizeSearchText cost ===");

            const [, shortMs] = timed(() => {
                for (const t of shortTexts) normalizeSearchText(t);
            });
            console.log(` 5000 short texts (10 chars): ${shortMs.toFixed(1)}ms (${(shortMs/5000*1000).toFixed(1)}µs/call)`);

            const [, medMs] = timed(() => {
                for (const t of mediumTexts) normalizeSearchText(t);
            });
            console.log(` 5000 medium texts (120 chars): ${medMs.toFixed(1)}ms (${(medMs/5000*1000).toFixed(1)}µs/call)`);

            const [, longMs] = timed(() => {
                for (const t of longTexts) normalizeSearchText(t);
            });
            console.log(` ${longTexts.length} flat texts (varying): ${longMs.toFixed(1)}ms (${(longMs/longTexts.length*1000).toFixed(1)}µs/call)`);
        });
    });

    /**
     * Test the searchPathTowardsRoot recursive walk which runs
     * for every candidate note in NoteFlatTextExp.
     */
    describe("searchPathTowardsRoot cost", () => {

        it("profile recursive walk with varying hierarchy depth", () => {
            console.log("\n=== Search path walk vs hierarchy depth ===");

            for (const depth of [3, 5, 8, 12]) {
                buildDataset(2000, {
                    matchFraction: 0.15,
                    depth,
                    contentWordCount: 50
                });

                const [results, ms] = timed(() => {
                    const ctx = new SearchContext({ fastSearch: true });
                    return searchService.findResultsWithQuery("target", ctx);
                });
                console.log(` depth=${depth}: ${ms.toFixed(1)}ms (${results.length} results)`);
            }
        });
    });

    /**
     * Content snippet extraction scaling — the operation that calls
     * note.getContent() for each result.
     */
    describe("Content snippet extraction", () => {

        it("profile snippet extraction with varying content sizes", () => {
            console.log("\n=== Content snippet extraction vs content size ===");

            for (const wordCount of [50, 200, 500, 1000, 2000, 5000]) {
                buildDataset(500, {
                    matchFraction: 0.5,
                    contentWordCount: wordCount
                });

                const ctx = new SearchContext({ fastSearch: true });
                const results = searchService.findResultsWithQuery("target", ctx);
                const trimmed = results.slice(0, 200);

                const [, ms] = timed(() => {
                    for (const r of trimmed) {
                        r.contentSnippet = searchService.extractContentSnippet(
                            r.noteId, ["target"]
                        );
                    }
                });

                const avgContentLen = Object.values(syntheticContent)
                    .slice(0, 100)
                    .reduce((s, c) => s + c.length, 0) / 100;

                console.log(` ${String(wordCount).padStart(5)} words/note (avg ${Math.round(avgContentLen)} chars) × ${trimmed.length} results: ${ms.toFixed(1)}ms (${(ms / trimmed.length).toFixed(3)}ms/note)`);
            }
        });

        it("profile snippet extraction with varying result counts", () => {
            console.log("\n=== Content snippet extraction vs result count ===");

            buildDataset(2000, {
                matchFraction: 0.5,
                contentWordCount: 500
            });

            const ctx = new SearchContext({ fastSearch: true });
            const allResults = searchService.findResultsWithQuery("target", ctx);

            for (const count of [5, 10, 20, 50, 100, 200]) {
                const subset = allResults.slice(0, count);

                const [, ms] = timed(() => {
                    for (const r of subset) {
                        r.contentSnippet = searchService.extractContentSnippet(
                            r.noteId, ["target"]
                        );
                    }
                });

                console.log(` ${String(count).padStart(3)} results: ${ms.toFixed(1)}ms (${(ms / count).toFixed(3)}ms/note)`);
            }
        });
    });

    /**
     * Two-phase exact/fuzzy search cost.
     */
    describe("Two-phase search cost", () => {

        for (const noteCount of [1000, 5000, 10000]) {
            it(`exact vs progressive with ${noteCount} notes`, () => {
                const timings: TimingEntry[] = [];

                buildDataset(noteCount, { matchFraction: 0.005, contentWordCount: 50 });

                const [exactR, exactMs] = timed(() => {
                    const ctx = new SearchContext({ fastSearch: true });
                    ctx.enableFuzzyMatching = false;
                    return searchService.findResultsWithQuery("target", ctx);
                });
                timings.push({ label: `Exact-only (${exactR.length} results)`, ms: exactMs });

                const [progR, progMs] = timed(() => {
                    const ctx = new SearchContext({ fastSearch: true });
                    return searchService.findResultsWithQuery("target", ctx);
                });
                timings.push({ label: `Progressive exact→fuzzy (${progR.length} results)`, ms: progMs });

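                // NB: the progressive run reuses caches warmed by the exact run above,
                // so the measured difference is a lower bound on the fuzzy phase cost.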
                const overhead = progMs - exactMs;
                timings.push({ label: `Fuzzy phase overhead`, ms: Math.max(0, overhead) });

                reportTimings(`Two-phase — ${noteCount} notes`, timings);
            });
        }
    });

    /**
     * Multi-token search with varying content sizes.
     * Real users search things like "meeting notes january" — this exercises
     * the multi-token path (which doesn't use the single-token fast path)
     * with a realistic mix of note sizes.
     */
    describe("Multi-token search with varying content sizes", () => {

        it("single vs multi-token autocomplete at scale", () => {
            console.log("\n=== Single vs multi-token autocomplete (varying content sizes) ===");

            for (const noteCount of [1000, 5000, 10000, 20000]) {
                buildDataset(noteCount, {
                    matchFraction: 0.15,
                    contentWordCount: 400,
                    varyContentSize: true,
                    depth: 5,
                    titleKeywords: ["meeting", "notes", "january"],
                    contentKeywords: ["meeting", "notes", "january"],
                });

                // Warm up
                searchService.searchNotesForAutocomplete("meeting", true);

                // Single token
                const singleTimes: number[] = [];
                for (let i = 0; i < 3; i++) {
                    const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting", true));
                    singleTimes.push(ms);
                }
                const singleAvg = singleTimes.reduce((a, b) => a + b, 0) / singleTimes.length;

                // Two tokens
                const twoTimes: number[] = [];
                for (let i = 0; i < 3; i++) {
                    const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes", true));
                    twoTimes.push(ms);
                }
                const twoAvg = twoTimes.reduce((a, b) => a + b, 0) / twoTimes.length;

                // Three tokens
                const threeTimes: number[] = [];
                for (let i = 0; i < 3; i++) {
                    const [, ms] = timed(() => searchService.searchNotesForAutocomplete("meeting notes january", true));
                    threeTimes.push(ms);
                }
                const threeAvg = threeTimes.reduce((a, b) => a + b, 0) / threeTimes.length;

                console.log(
                    ` ${String(noteCount).padStart(6)} notes: ` +
                    `1-token ${singleAvg.toFixed(1)}ms ` +
                    `2-token ${twoAvg.toFixed(1)}ms ` +
                    `3-token ${threeAvg.toFixed(1)}ms`
                );
            }
        });

        it("multi-token with realistic content size distribution", () => {
            console.log("\n=== Multi-token search — content size distribution ===");

            buildDataset(5000, {
                matchFraction: 0.15,
                contentWordCount: 400,
                varyContentSize: true,
                depth: 5,
                titleKeywords: ["project", "review"],
                contentKeywords: ["project", "review"],
            });

            // Report the actual content size distribution
            const sizes = Object.values(syntheticContent).map(c => c.length);
            sizes.sort((a, b) => a - b);
            const p10 = sizes[Math.floor(sizes.length * 0.1)];
            const p50 = sizes[Math.floor(sizes.length * 0.5)];
            const p90 = sizes[Math.floor(sizes.length * 0.9)];
            const p99 = sizes[Math.floor(sizes.length * 0.99)];
            console.log(` Content sizes: p10=${p10} p50=${p50} p90=${p90} p99=${p99} chars`);

            // Warm up
            searchService.searchNotesForAutocomplete("project", true);

            const queries = [
                "project",
                "project review",
                "project review document",
                `${randomWord(7)}`, // no-match single token
                `${randomWord(5)} ${randomWord(6)}`, // no-match multi token
            ];

            for (const query of queries) {
                const times: number[] = [];
                let resultCount = 0;
                for (let i = 0; i < 3; i++) {
                    const [r, ms] = timed(() => searchService.searchNotesForAutocomplete(query, true));
                    times.push(ms);
                    resultCount = r.length;
                }
                const avg = times.reduce((a, b) => a + b, 0) / times.length;
                const label = `"${query}"`.padEnd(35);
                console.log(` ${label} ${avg.toFixed(1)}ms (${resultCount} results)`);
            }
        });
    });

    /**
     * End-to-end scaling to give the full picture.
     */
    describe("End-to-end scaling", () => {

        it("autocomplete at different scales", () => {
            console.log("\n=== End-to-end autocomplete scaling ===");
            console.log(" (fastSearch=true, monkeypatched getContent, no real SQL)");

            for (const noteCount of [100, 500, 1000, 2000, 5000, 10000, 20000]) {
                buildDataset(noteCount, {
                    matchFraction: 0.2,
                    contentWordCount: 300,
                    depth: 4
                });

                // Warm up
                searchService.searchNotesForAutocomplete("target", true);

                const times: number[] = [];
                for (let i = 0; i < 3; i++) {
                    const [, ms] = timed(() => searchService.searchNotesForAutocomplete("target", true));
                    times.push(ms);
                }

                const avg = times.reduce((a, b) => a + b, 0) / times.length;
                const min = Math.min(...times);

                console.log(
                    ` ${String(noteCount).padStart(6)} notes: avg ${avg.toFixed(1)}ms ` +
                    `min ${min.toFixed(1)}ms`
                );
            }
        });

        it("compare fast vs non-fast search", () => {
            console.log("\n=== Fast vs non-fast search (no real SQL for content) ===");

            for (const noteCount of [500, 2000, 5000]) {
                buildDataset(noteCount, {
                    matchFraction: 0.2,
                    contentWordCount: 200,
                    depth: 4
                });

                const [, fastMs] = timed(() => {
                    const ctx = new SearchContext({ fastSearch: true });
                    return searchService.findResultsWithQuery("target", ctx);
                });

                // Non-fast search tries NoteContentFulltextExp which uses sql.iterateRows
                // This will likely fail/return empty since there's no real DB, but we
                // can still measure the overhead of attempting it
                let nonFastMs: number;
                let nonFastCount: number;
                try {
                    const [results, ms] = timed(() => {
                        const ctx = new SearchContext({ fastSearch: false });
                        return searchService.findResultsWithQuery("target", ctx);
                    });
                    nonFastMs = ms;
                    nonFastCount = results.length;
                } catch {
                    nonFastMs = -1;
                    nonFastCount = -1;
                }

                console.log(
                    ` ${String(noteCount).padStart(5)} notes: fast=${fastMs.toFixed(1)}ms ` +
                    `non-fast=${nonFastMs >= 0 ? nonFastMs.toFixed(1) + 'ms' : 'FAILED (no real DB)'} ` +
                    `(non-fast results: ${nonFastCount})`
                );
            }
        });
    });
});

@@ -21,4 +21,6 @@ export interface SearchParams {
    limit?: number | null;
    debug?: boolean;
    fuzzyAttributeSearch?: boolean;
    /** When true, skip the two-phase fuzzy fallback and use the single-token fast path. */
    autocomplete?: boolean;
}

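// Usage sketch for the new flag, mirroring how the profiling tests in this
// commit construct their search contexts:
//
//   const ctx = new SearchContext({ fastSearch: true, autocomplete: true });
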
@@ -1,5 +1,5 @@
import { describe, it, expect } from "vitest";
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js';

describe('Fuzzy Search Core', () => {
    describe('calculateOptimizedEditDistance', () => {
@@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => {
            expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
        });
    });

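    // For orientation while reading these tests: a minimal sketch of the
    // single-row / early-termination idea behind calculateOptimizedEditDistance.
    // Illustrative only; not the shipped implementation.
    function editDistanceSketch(a: string, b: string, maxDistance: number): number {
        // row[i] = edit distance between a[0..i) and the prefix of b consumed so far
        const row = Array.from({ length: a.length + 1 }, (_, i) => i);
        for (let j = 1; j <= b.length; j++) {
            let prevDiag = row[0];
            row[0] = j;
            let rowMin = row[0];
            for (let i = 1; i <= a.length; i++) {
                const tmp = row[i];
                row[i] = Math.min(
                    row[i] + 1,                                // deletion
                    row[i - 1] + 1,                            // insertion
                    prevDiag + (a[i - 1] === b[j - 1] ? 0 : 1) // substitution
                );
                prevDiag = tmp;
                rowMin = Math.min(rowMin, row[i]);
            }
            // Row minima never decrease, so once every cell exceeds maxDistance
            // the final distance must too: bail out early.
            if (rowMin > maxDistance) return maxDistance + 1;
        }
        return row[a.length];
    }
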
    describe('stripHtmlTags', () => {
        it('strips simple HTML tags', () => {
            expect(stripHtmlTags('<p>Hello</p>')).toBe('Hello');
            expect(stripHtmlTags('<div><span>World</span></div>')).toBe('World');
            expect(stripHtmlTags('<b>Bold</b> and <i>italic</i>')).toBe('Bold and italic');
        });

        it('handles self-closing tags', () => {
            expect(stripHtmlTags('Line1<br/>Line2')).toBe('Line1Line2');
            expect(stripHtmlTags('Image: <img src="x.png"/>')).toBe('Image: ');
        });

        it('handles tags with attributes', () => {
            expect(stripHtmlTags('<a href="url">Link</a>')).toBe('Link');
            expect(stripHtmlTags('<div class="foo" id="bar">Content</div>')).toBe('Content');
        });

        it('handles nested tag patterns securely', () => {
            // Security property: no complete <tag> patterns remain after stripping
            // Residual `>` chars are harmless for XSS

            // Malformed nesting: the regex matches from the first "<" to the
            // first ">", so "<a<b>" is stripped in a single pass, leaving only
            // the harmless residual "c>"
            const result1 = stripHtmlTags('<a<b>c>text');
            expect(result1).not.toMatch(/<[a-z]/i); // No opening tags remain
            expect(result1).toBe('c>text'); // Residual text is safe

            // Complex nesting leaves no exploitable patterns
            const result2 = stripHtmlTags('<scr<script>ipt>alert(1)</script>');
            expect(result2).not.toMatch(/<script/i);
            expect(result2).not.toMatch(/<\/script/i);

            // Double-nested removal: "<<b>" is stripped as one match, leaving
            // no "<" at all
            const result3 = stripHtmlTags('<<b>script>code');
            expect(result3).toBe('script>code');
            expect(result3).not.toMatch(/<[a-z]/i);
        });

        it('handles unclosed tags', () => {
            expect(stripHtmlTags('<p>Unclosed paragraph')).toBe('Unclosed paragraph');
            expect(stripHtmlTags('Text with <b>unclosed bold')).toBe('Text with unclosed bold');
        });

        it('handles empty and null input', () => {
            expect(stripHtmlTags('')).toBe('');
            expect(stripHtmlTags(null as any)).toBe('');
            expect(stripHtmlTags(undefined as any)).toBe('');
        });

        it('returns plain text unchanged', () => {
            expect(stripHtmlTags('Just plain text')).toBe('Just plain text');
            expect(stripHtmlTags('No tags here!')).toBe('No tags here!');
        });

        it('handles angle brackets in text', () => {
            // Standalone > without matching < is preserved
            expect(stripHtmlTags('Text > with > symbols')).toBe('Text > with > symbols');
            // Note: `< 10 >` looks like a tag to the regex, so it is stripped; a known
            // limitation that is acceptable for search snippets (still no XSS risk)
            expect(stripHtmlTags('Math: 5 < 10 > 3')).toBe('Math: 5  3');
            // But a "<" with no closing ">" is not treated as a tag
            expect(stripHtmlTags('5 < 10')).toBe('5 < 10');
        });
    });
});

@@ -49,6 +49,30 @@ export function normalizeSearchText(text: string): string {
    return normalize(text);
}

/**
 * Strips HTML tags from content for snippet extraction.
 * Uses iterative replacement to handle nested/malformed tags like `<scr<script>ipt>`.
 *
 * @param html The HTML content to strip
 * @returns Plain text with all HTML tags removed
 */
export function stripHtmlTags(html: string): string {
    if (!html || typeof html !== "string") {
        return "";
    }

    let result = html;
    let previous: string;

    // Loop until no more tags — handles nested cases like <scr<script>ipt>
    do {
        previous = result;
        result = result.replace(/<[^>]*>/g, "");
    } while (result !== previous);

    return result;
}

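// Worked trace of the loop above on a nested payload (verifiable by hand):
//   "<scr<script>ipt>alert(1)</script>"
//   pass 1: strips "<scr<script>" and "</script>"  -> "ipt>alert(1)"
//   pass 2: no "<...>" pattern left, loop exits    -> "ipt>alert(1)"
// The residue contains no complete tag, so no markup survives.
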
/**
 * Optimized edit distance calculation using single array and early termination.
 * This is significantly more memory efficient than the 2D matrix approach and includes
@@ -275,19 +299,19 @@ export function fuzzyMatchWordWithResult(token: string, text: string, maxDistanc
    }

    try {
        // Normalize both strings for comparison
        // Normalize for comparison — some callers pass pre-normalized text,
        // others don't, so this function must be self-contained.
        const normalizedToken = token.toLowerCase();
        const normalizedText = text.toLowerCase();

        // Exact match check first (most common case)
        if (normalizedText.includes(normalizedToken)) {
            // Find the exact match in the original text to preserve case
            const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i'));
            return exactMatch ? exactMatch[0] : token;
            // Find the exact match position and return the original substring with case preserved
            const matchIndex = normalizedText.indexOf(normalizedToken);
            return text.substring(matchIndex, matchIndex + normalizedToken.length);
        }

        // For fuzzy matching, we need to check individual words in the text
        // Split the text into words and check each word against the token

        // For fuzzy matching, split into words and check each against the token
        const words = normalizedText.split(/\s+/).filter(word => word.length > 0);
        const originalWords = text.split(/\s+/).filter(word => word.length > 0);

@@ -135,6 +135,12 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions<KeyboardActi
    backgroundEffects: boolean;
    newLayout: boolean;

    // Search settings
    /** Whether fuzzy matching is enabled in search (matches similar words when exact matches are insufficient). */
    searchEnableFuzzyMatching: boolean;
    /** Whether fuzzy matching is enabled for autocomplete (typing in search bar). Disabled by default for faster response. */
    searchAutocompleteFuzzy: boolean;

    // Share settings
    redirectBareDomain: boolean;
    showLoginInShareTheme: boolean;