feat(search): try to rice performance some more

This commit is contained in:
perfectra1n
2026-03-11 21:11:55 -07:00
parent 585b6ccd3e
commit 77733ce205
5 changed files with 318 additions and 227 deletions

View File

@@ -4,8 +4,8 @@
* Uses the real SQLite database (spec/db/document.db loaded in-memory),
* real sql module, real becca cache, and the full app stack.
*
* Seeds a large number of notes via direct SQL (much faster than ETAPI)
* to create a realistic dataset for profiling.
* Profiles search at large scale (50K+ notes) to match real-world
* performance reports from users with 240K+ notes.
*/
import { Application } from "express";
import { beforeAll, describe, expect, it } from "vitest";
@@ -58,224 +58,246 @@ describe("Search profiling (integration)", () => {
app = await buildApp();
});
it("seed and profile with realistic data", async () => {
it("large-scale profiling (50K notes)", async () => {
const sql = (await import("../src/services/sql.js")).default;
const becca = (await import("../src/becca/becca.js")).default;
const beccaLoader = (await import("../src/becca/becca_loader.js")).default;
const cls = (await import("../src/services/cls.js")).default;
const searchService = (await import("../src/services/search/services/search.js")).default;
const SearchContext = (await import("../src/services/search/search_context.js")).default;
const beccaService = (await import("../src/becca/becca_service.js")).default;
await new Promise<void>((resolve) => {
cls.init(() => {
const initialNoteCount = Object.keys(becca.notes).length;
console.log(`\n Initial becca notes: ${initialNoteCount}`);
const configs = [
{ notes: 2000, words: 500, label: "2K notes × 500 words (~4KB)" },
{ notes: 2000, words: 2000, label: "2K notes × 2000 words (~15KB)" },
{ notes: 5000, words: 500, label: "5K notes × 500 words (~4KB)" },
{ notes: 5000, words: 2000, label: "5K notes × 2000 words (~15KB)" },
{ notes: 10000, words: 1000, label: "10K notes × 1000 words (~8KB)" },
];
// ── Seed 50K notes with hierarchy ──
// Some folders (depth), some with common keyword "test" in title
const TOTAL_NOTES = 50000;
const FOLDER_COUNT = 500; // 500 folders
const NOTES_PER_FOLDER = (TOTAL_NOTES - FOLDER_COUNT) / FOLDER_COUNT; // ~99 notes per folder
const MATCH_FRACTION = 0.10; // 10% match "test" — ~5000 notes
const CONTENT_WORDS = 500;
for (const cfg of configs) {
// Reset DB: delete all seeded notes from prior iteration
sql.execute(`DELETE FROM blobs WHERE blobId LIKE 'seed%'`);
sql.execute(`DELETE FROM notes WHERE noteId LIKE 'seed%'`);
sql.execute(`DELETE FROM branches WHERE branchId LIKE 'seed%'`);
const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000");
console.log(` Seeding ${TOTAL_NOTES} notes (${FOLDER_COUNT} folders, ~${NOTES_PER_FOLDER.toFixed(0)} per folder)...`);
const TOTAL_NOTES = cfg.notes;
const MATCH_FRACTION = 0.15;
const CONTENT_WORDS = cfg.words;
const matchCount = Math.floor(TOTAL_NOTES * MATCH_FRACTION);
const [, seedMs] = timed(() => {
sql.transactional(() => {
const folderIds: string[] = [];
const now = new Date().toISOString().replace("T", " ").replace("Z", "+0000");
// Create folders under root
for (let f = 0; f < FOLDER_COUNT; f++) {
const noteId = `seed${randomId(8)}`;
const branchId = `seed${randomId(8)}`;
const blobId = `seed${randomId(16)}`;
folderIds.push(noteId);
console.log(`\n ──── ${cfg.label} ────`);
console.log(` Seeding ${TOTAL_NOTES} notes (${matchCount} with keyword)...`);
sql.execute(
`INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`,
[blobId, `<p>Folder ${f}</p>`, now, now]
);
sql.execute(
`INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted,
dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`,
[noteId, `Folder ${f} ${randomWord(5)}`, blobId, now, now, now, now]
);
sql.execute(
`INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified)
VALUES (?, ?, 'root', ?, 0, 0, ?)`,
[branchId, noteId, f * 10, now]
);
}
const [, seedMs] = timed(() => {
sql.transactional(() => {
for (let i = 0; i < TOTAL_NOTES; i++) {
const isMatch = i < matchCount;
// Create notes under folders
let noteIdx = 0;
for (let f = 0; f < FOLDER_COUNT; f++) {
const parentId = folderIds[f];
for (let n = 0; n < NOTES_PER_FOLDER; n++) {
const isMatch = noteIdx < TOTAL_NOTES * MATCH_FRACTION;
const noteId = `seed${randomId(8)}`;
const branchId = `seed${randomId(8)}`;
const blobId = `seed${randomId(16)}`;
const title = isMatch
? `Performance Doc ${i} ${randomWord(6)}`
: `General Note ${i} ${randomWord(6)} ${randomWord(5)}`;
const content = generateContent(
CONTENT_WORDS,
isMatch ? "performance" : undefined
);
? `Test Document ${noteIdx} ${randomWord(6)}`
: `Note ${noteIdx} ${randomWord(6)} ${randomWord(5)}`;
const content = generateContent(CONTENT_WORDS, isMatch ? "test" : undefined);
sql.execute(
`INSERT INTO blobs (blobId, content, dateModified, utcDateModified)
VALUES (?, ?, ?, ?)`,
`INSERT INTO blobs (blobId, content, dateModified, utcDateModified) VALUES (?, ?, ?, ?)`,
[blobId, content, now, now]
);
sql.execute(
`INSERT INTO notes (noteId, title, type, mime, blobId, isProtected, isDeleted,
dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES (?, ?, 'text', 'text/html', ?, 0, 0, ?, ?, ?, ?)`,
[noteId, title, blobId, now, now, now, now]
);
sql.execute(
`INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded,
utcDateModified)
VALUES (?, ?, 'root', ?, 0, 0, ?)`,
[branchId, noteId, i * 10, now]
`INSERT INTO branches (branchId, noteId, parentNoteId, notePosition, isDeleted, isExpanded, utcDateModified)
VALUES (?, ?, ?, ?, 0, 0, ?)`,
[branchId, noteId, parentId, n * 10, now]
);
noteIdx++;
}
});
});
console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`);
// Reload becca to pick up new notes
const [, reloadMs] = timed(() => {
beccaLoader.load();
});
console.log(` Becca reload: ${reloadMs.toFixed(0)}ms`);
console.log(` Becca notes after seed: ${Object.keys(becca.notes).length}`);
// Verify content is accessible
const sampleNote = Object.values(becca.notes).find(n => n.title.startsWith("Performance Doc"));
if (sampleNote) {
const content = sampleNote.getContent();
console.log(` Sample content length: ${typeof content === 'string' ? content.length : 0} chars`);
}
// ==========================================
// PROFILING
// ==========================================
console.log(`\n --- PROFILING (${cfg.label}) ---\n`);
// --- 1. Fast search (NoteFlatTextExp only) ---
searchService.findResultsWithQuery("performance", new SearchContext({ fastSearch: true }));
const fastTimes: number[] = [];
let fastResultCount = 0;
for (let i = 0; i < 5; i++) {
const [r, ms] = timed(() =>
searchService.findResultsWithQuery("performance",
new SearchContext({ fastSearch: true })
)
);
fastTimes.push(ms);
fastResultCount = r.length;
}
const fastAvg = fastTimes.reduce((a, b) => a + b, 0) / fastTimes.length;
console.log(` Fast search (flat text only): avg ${fastAvg.toFixed(1)}ms (${fastResultCount} results)`);
// --- 2. Full search (flat text + content fulltext via SQL) ---
const fullTimes: number[] = [];
let fullResultCount = 0;
for (let i = 0; i < 3; i++) {
const [r, ms] = timed(() =>
searchService.findResultsWithQuery("performance",
new SearchContext({ fastSearch: false })
)
);
fullTimes.push(ms);
fullResultCount = r.length;
}
const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length;
console.log(` Full search (flat + SQL content): avg ${fullAvg.toFixed(1)}ms (${fullResultCount} results)`);
// --- 3. Content snippet extraction ---
const fastResults = searchService.findResultsWithQuery("performance",
new SearchContext({ fastSearch: true }));
const trimmed = fastResults.slice(0, 200);
const tokens = ["performance"];
const snippetTimes: number[] = [];
for (let i = 0; i < 3; i++) {
const [, ms] = timed(() => {
for (const r of trimmed) {
r.contentSnippet = searchService.extractContentSnippet(r.noteId, tokens);
}
});
snippetTimes.push(ms);
}
const snippetAvg = snippetTimes.reduce((a, b) => a + b, 0) / snippetTimes.length;
console.log(` Content snippet (${trimmed.length} results): avg ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`);
// --- 4. Raw getContent() cost ---
const contentTimes: number[] = [];
const textNotes = trimmed
.map(r => becca.notes[r.noteId])
.filter(n => n && ["text", "code"].includes(n.type));
for (let i = 0; i < 5; i++) {
const [, ms] = timed(() => {
for (const n of textNotes) n.getContent();
});
contentTimes.push(ms);
}
const contentAvg = contentTimes.reduce((a, b) => a + b, 0) / contentTimes.length;
console.log(` getContent() × ${textNotes.length} notes: avg ${contentAvg.toFixed(1)}ms (${(contentAvg / textNotes.length).toFixed(3)}ms/note)`);
// --- 5. striptags + normalize cost (isolated) ---
const striptags = require("striptags");
const normalizeString = require("normalize-strings");
const contents = textNotes.map(n => n.getContent() as string).filter(Boolean);
const [, stripMs] = timed(() => {
for (const c of contents) {
striptags(c);
}
});
console.log(` striptags × ${contents.length} notes: ${stripMs.toFixed(1)}ms (${(stripMs / contents.length).toFixed(3)}ms/note)`);
});
console.log(` SQL seeding: ${seedMs.toFixed(0)}ms`);
const stripped = contents.map(c => striptags(c));
const [, normMs] = timed(() => {
for (const s of stripped) {
normalizeString(s.toLowerCase());
}
});
console.log(` normalizeString × ${stripped.length} notes: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`);
const [, reloadMs] = timed(() => beccaLoader.load());
const totalNotes = Object.keys(becca.notes).length;
console.log(` Becca reload: ${reloadMs.toFixed(0)}ms Total notes: ${totalNotes}`);
// --- 6. Full autocomplete ---
const autoTimes: number[] = [];
let autoResultCount = 0;
for (let i = 0; i < 3; i++) {
const [r, ms] = timed(() =>
searchService.searchNotesForAutocomplete("performance", true)
);
autoTimes.push(ms);
autoResultCount = r.length;
// ── Warm caches ──
searchService.searchNotesForAutocomplete("test", true);
// ════════════════════════════════════════════
// PROFILING AT SCALE
// ════════════════════════════════════════════
console.log(`\n ════ PROFILING (${totalNotes} notes) ════\n`);
// 1. getCandidateNotes cost (the full-scan bottleneck)
const allNotes = Object.values(becca.notes);
const [, flatScanMs] = timed(() => {
let count = 0;
for (const note of allNotes) {
const ft = note.getFlatText();
if (ft.includes("test")) count++;
}
const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length;
console.log(`\n FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms (${autoResultCount} results)`);
return count;
});
console.log(` getFlatText + includes scan (${allNotes.length} notes): ${flatScanMs.toFixed(1)}ms`);
// --- 7. SQL content scan cost ---
const [scanCount, scanMs] = timed(() => {
let count = 0;
for (const row of sql.iterateRows<{ content: Buffer | string }>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND LENGTH(content) < 2097152`)) {
count++;
}
return count;
});
console.log(` SQL content scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`);
// --- Summary ---
console.log(`\n === SUMMARY (${cfg.label}, ${Object.keys(becca.notes).length} total notes) ===`);
console.log(` Fast search: ${fastAvg.toFixed(1)}ms`);
console.log(` Full search: ${fullAvg.toFixed(1)}ms`);
console.log(` Content snippets: ${snippetAvg.toFixed(1)}ms (${(snippetAvg / trimmed.length).toFixed(3)}ms/note)`);
console.log(` normalizeString: ${normMs.toFixed(1)}ms (${(normMs / stripped.length).toFixed(3)}ms/note)`);
console.log(` Full autocomplete: ${autoAvg.toFixed(1)}ms`);
console.log(` SQL scan: ${scanMs.toFixed(1)}ms (${scanCount} rows)`);
// 2. Full findResultsWithQuery (includes candidate scan + parent walk + scoring)
const findTimes: number[] = [];
let findResultCount = 0;
for (let i = 0; i < 3; i++) {
const [r, ms] = timed(() =>
searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true }))
);
findTimes.push(ms);
findResultCount = r.length;
}
const findAvg = findTimes.reduce((a, b) => a + b, 0) / findTimes.length;
console.log(` findResultsWithQuery (fast): avg ${findAvg.toFixed(1)}ms (${findResultCount} results)`);
// 3. Exact-only (no fuzzy)
const exactTimes: number[] = [];
for (let i = 0; i < 3; i++) {
const [, ms] = timed(() =>
searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true, enableFuzzyMatching: false }))
);
exactTimes.push(ms);
}
const exactAvg = exactTimes.reduce((a, b) => a + b, 0) / exactTimes.length;
console.log(` findResultsWithQuery (exact): avg ${exactAvg.toFixed(1)}ms`);
console.log(` Fuzzy overhead: ${(findAvg - exactAvg).toFixed(1)}ms`);
// 4. SearchResult construction + computeScore cost (isolated)
const results = searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: true }));
console.log(` Total results before trim: ${results.length}`);
const [, scoreAllMs] = timed(() => {
for (const r of results) r.computeScore("test", ["test"], true);
});
console.log(` computeScore × ${results.length}: ${scoreAllMs.toFixed(1)}ms (${(scoreAllMs / results.length).toFixed(3)}ms/result)`);
// 5. getNoteTitleForPath for all results
const [, pathTitleMs] = timed(() => {
for (const r of results) beccaService.getNoteTitleForPath(r.notePathArray);
});
console.log(` getNoteTitleForPath × ${results.length}: ${pathTitleMs.toFixed(1)}ms`);
// 6. Content snippet extraction (only 200)
const trimmed = results.slice(0, 200);
const [, snippetMs] = timed(() => {
for (const r of trimmed) {
r.contentSnippet = searchService.extractContentSnippet(r.noteId, ["test"]);
}
});
console.log(` extractContentSnippet × 200: ${snippetMs.toFixed(1)}ms`);
// 7. Highlighting (only 200)
const [, hlMs] = timed(() => {
searchService.highlightSearchResults(trimmed, ["test"]);
});
console.log(` highlightSearchResults × 200: ${hlMs.toFixed(1)}ms`);
// 7b. getBestNotePath cost (used by fast path)
const sampleNotes = Object.values(becca.notes).filter(n => n.title.startsWith("Test Document")).slice(0, 1000);
const [, bestPathMs] = timed(() => {
for (const n of sampleNotes) n.getBestNotePath();
});
console.log(` getBestNotePath × ${sampleNotes.length}: ${bestPathMs.toFixed(1)}ms (${(bestPathMs/sampleNotes.length).toFixed(3)}ms/note)`);
// 8. Full autocomplete end-to-end
const autoTimes: number[] = [];
let autoCount = 0;
for (let i = 0; i < 3; i++) {
const [r, ms] = timed(() =>
searchService.searchNotesForAutocomplete("test", true)
);
autoTimes.push(ms);
autoCount = r.length;
}
const autoAvg = autoTimes.reduce((a, b) => a + b, 0) / autoTimes.length;
const autoMin = Math.min(...autoTimes);
console.log(`\n ★ FULL AUTOCOMPLETE: avg ${autoAvg.toFixed(1)}ms min ${autoMin.toFixed(1)}ms (${autoCount} results)`);
// 9. With a less common search term (fewer matches)
const rareTimes: number[] = [];
let rareCount = 0;
for (let i = 0; i < 3; i++) {
const [r, ms] = timed(() =>
searchService.searchNotesForAutocomplete("leitfaden", true)
);
rareTimes.push(ms);
rareCount = r.length;
}
const rareAvg = rareTimes.reduce((a, b) => a + b, 0) / rareTimes.length;
console.log(` Autocomplete "leitfaden": avg ${rareAvg.toFixed(1)}ms (${rareCount} results)`);
// 10. Full search (fastSearch=false) — the 2.7s bottleneck
console.log(`\n ── Full search (fastSearch=false) ──`);
const fullTimes: number[] = [];
let fullCount = 0;
for (let i = 0; i < 2; i++) {
const [r, ms] = timed(() =>
searchService.findResultsWithQuery("test", new SearchContext({ fastSearch: false }))
);
fullTimes.push(ms);
fullCount = r.length;
}
const fullAvg = fullTimes.reduce((a, b) => a + b, 0) / fullTimes.length;
console.log(` Full search (flat + SQL): avg ${fullAvg.toFixed(1)}ms (${fullCount} results)`);
// 11. SQL content scan alone
const [scanCount, scanMs] = timed(() => {
let count = 0;
for (const row of sql.iterateRows<{ content: Buffer | string }>(`
SELECT noteId, type, mime, content, isProtected
FROM notes JOIN blobs USING (blobId)
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND LENGTH(content) < 2097152`)) {
count++;
}
return count;
});
console.log(` Raw SQL scan (${scanCount} rows): ${scanMs.toFixed(1)}ms`);
// ── Summary ──
console.log(`\n ════ SUMMARY ════`);
console.log(` Notes: ${totalNotes} | Matches: ${findResultCount} | Hierarchy depth: 3 (root → folder → note)`);
console.log(` ──────────────────────────────────`);
console.log(` Autocomplete (fast): ${autoAvg.toFixed(1)}ms`);
console.log(` findResults: ${findAvg.toFixed(1)}ms (${((findAvg/autoAvg)*100).toFixed(0)}%)`);
console.log(` snippets+highlight: ${(snippetMs + hlMs).toFixed(1)}ms (${(((snippetMs+hlMs)/autoAvg)*100).toFixed(0)}%)`);
console.log(` Full search: ${fullAvg.toFixed(1)}ms`);
resolve();
});

View File

@@ -31,9 +31,17 @@ export default class Becca {
allNoteSetCache: NoteSet | null;
/**
 * Pre-built parallel arrays for fast flat text scanning in search:
 * `flatTexts[i]` holds the flat text of `notes[i]`.
 * Intended to avoid per-note property access overhead when iterating 50K+ notes.
 * `null` means the index has not been built yet; getFlatTextIndex() builds it on demand.
 * Dirtied when notes change (along with allNoteSetCache) via dirtyNoteSetCache(),
 * and also reset by BNote when a note invalidates its own caches.
 */
flatTextIndex: { notes: BNote[], flatTexts: string[] } | null;
/** Delegates to reset() for the initial state, then starts with both caches cleared. */
constructor() {
    this.reset();

    // Neither cache exists on a fresh instance.
    this.flatTextIndex = null;
    this.allNoteSetCache = null;
}
reset() {
@@ -239,6 +247,28 @@ export default class Becca {
/**
 * Should be called when the set of all non-skeleton notes changes (added/removed).
 * Clears both the cached note set and the flat text index; the index is rebuilt
 * by the next getFlatTextIndex() call.
 */
dirtyNoteSetCache() {
    this.flatTextIndex = null;
    this.allNoteSetCache = null;
}
/**
 * Lazily builds and returns the flat text index: two parallel arrays in which
 * `flatTexts[i]` is the value returned by `notes[i].getFlatText()`.
 *
 * The index is built from getAllNoteSet() on first use and the same object is
 * returned on later calls until `flatTextIndex` is reset to null.
 *
 * NOTE(review): callers treat the flat texts as already normalized (lowercase,
 * diacritics removed) — confirm that BNote.getFlatText() guarantees this.
 *
 * @returns the cached `{ notes, flatTexts }` index
 */
getFlatTextIndex(): { notes: BNote[], flatTexts: string[] } {
    if (this.flatTextIndex) {
        return this.flatTextIndex;
    }

    // Snapshot the note list first, then derive the flat texts in the same order
    // so both arrays stay index-aligned.
    const notes: BNote[] = [...this.getAllNoteSet().notes];
    const flatTexts: string[] = notes.map((note) => note.getFlatText());

    const index = { notes, flatTexts };
    this.flatTextIndex = index;
    return index;
}
getAllNoteSet() {

View File

@@ -790,6 +790,9 @@ class BNote extends AbstractBeccaEntity<BNote> {
this.__attributeCache = null;
this.__inheritableAttributeCache = null;
this.__ancestorCache = null;
// Dirty the becca-level flat text index since this note's flat text may have changed
this.becca.flatTextIndex = null;
}
invalidateSubTree(path: string[] = []) {

View File

@@ -99,6 +99,22 @@ class NoteFlatTextExp extends Expression {
const candidateNotes = this.getCandidateNotes(inputNoteSet, searchContext);
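// Fast path for single-token searches with a limit (e.g. autocomplete):
// Skip the expensive recursive parent walk and just use getBestNotePath().
// The flat text already matched, so we know the token is present.
// NOTE: this returns before the loop below, so that loop's single-token
// noteId equality check is only reached when no limit is set.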
if (this.tokens.length === 1 && searchContext.limit) {
for (const note of candidateNotes) {
if (!resultNoteSet.hasNoteId(note.noteId)) {
const notePath = note.getBestNotePath();
if (notePath) {
executionContext.noteIdToNotePath[note.noteId] = notePath;
resultNoteSet.add(note);
}
}
}
return resultNoteSet;
}
for (const note of candidateNotes) {
// autocomplete should be able to find notes by their noteIds as well (only leafs)
if (this.tokens.length === 1 && note.noteId.toLowerCase() === this.tokens[0]) {
@@ -112,7 +128,7 @@ class NoteFlatTextExp extends Expression {
// Add defensive checks for undefined properties
const typeMatches = note.type && note.type.includes(token);
const mimeMatches = note.mime && note.mime.includes(token);
if (typeMatches || mimeMatches) {
foundAttrTokens.push(token);
}
@@ -165,14 +181,38 @@ class NoteFlatTextExp extends Expression {
getCandidateNotes(noteSet: NoteSet, searchContext?: SearchContext): BNote[] {
const candidateNotes: BNote[] = [];
for (const note of noteSet.notes) {
const normalizedFlatText = normalizeSearchText(note.getFlatText());
// For limited searches (e.g. autocomplete), cap candidates to avoid
// processing thousands of matches when only a few hundred are needed.
// Use 5x the limit to ensure enough quality candidates for scoring.
const maxCandidates = searchContext?.limit ? searchContext.limit * 5 : Infinity;
// Use the pre-built flat text index for fast scanning.
// This provides pre-computed flat texts in a parallel array, avoiding
// per-note property access overhead at large scale (50K+ notes).
const { notes: indexNotes, flatTexts } = becca.getFlatTextIndex();
// Detect whether noteSet already covers every indexed note (compared by length only),
// so the per-note membership check below can be skipped in that case
const isFullSet = noteSet.notes.length === indexNotes.length;
for (let i = 0; i < indexNotes.length; i++) {
const note = indexNotes[i];
// Skip notes not in the input set (only check when not using the full set)
if (!isFullSet && !noteSet.hasNoteId(note.noteId)) {
continue;
}
const flatText = flatTexts[i];
for (const token of this.tokens) {
if (this.smartMatch(normalizedFlatText, token, searchContext)) {
if (this.smartMatch(flatText, token, searchContext)) {
candidateNotes.push(note);
break;
}
}
if (candidateNotes.length >= maxCandidates) {
break;
}
}
return candidateNotes;

View File

@@ -16,7 +16,6 @@ import type { SearchParams, TokenStructure } from "./types.js";
import type Expression from "../expressions/expression.js";
import sql from "../../sql.js";
import scriptService from "../../script.js";
import striptags from "striptags";
import protectedSessionService from "../../protected_session.js";
export interface SearchNoteResult {
@@ -249,23 +248,30 @@ function findResultsWithExpression(expression: Expression, searchContext: Search
return performSearch(expression, searchContext, false);
}
// For limited searches (e.g. autocomplete), skip the expensive two-phase
// fuzzy fallback. The user is typing and will refine their query — exact
// matching is sufficient and avoids a second full scan of all notes.
if (searchContext.limit) {
return performSearch(expression, searchContext, false);
}
// Phase 1: Try exact matches first (without fuzzy matching)
const exactResults = performSearch(expression, searchContext, false);
// Check if we have sufficient high-quality results
const minResultThreshold = 5;
const minScoreForQuality = 10; // Minimum score to consider a result "high quality"
const highQualityResults = exactResults.filter(result => result.score >= minScoreForQuality);
// If we have enough high-quality exact matches, return them
if (highQualityResults.length >= minResultThreshold) {
return exactResults;
}
// Phase 2: Add fuzzy matching as fallback when exact matches are insufficient
const fuzzyResults = performSearch(expression, searchContext, true);
// Merge results, ensuring exact matches always rank higher than fuzzy matches
return mergeExactAndFuzzyResults(exactResults, fuzzyResults);
}
@@ -447,7 +453,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
try {
let content = note.getContent();
if (!content || typeof content !== "string") {
return "";
}
@@ -463,77 +469,66 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
return ""; // Protected but no session available
}
// Strip HTML tags for text notes
// Strip HTML tags for text notes — use fast regex for snippet extraction
// (striptags library is ~18x slower and not needed for search snippets)
if (note.type === "text") {
content = striptags(content);
content = content.replace(/<[^>]*>/g, "");
}
// Normalize whitespace while preserving paragraph breaks
// First, normalize multiple newlines to double newlines (paragraph breaks)
content = content.replace(/\n\s*\n/g, "\n\n");
// Then normalize spaces within lines
content = content.split('\n').map(line => line.replace(/\s+/g, " ").trim()).join('\n');
// Finally trim the whole content
content = content.trim();
if (!content) {
return "";
}
// Try to find a snippet around the first matching token
// Find match position using normalize on the raw stripped content.
// We use a single normalize() pass — no need for expensive whitespace
// normalization just to find the match index.
const normalizedContent = normalize(content);
const normalizedTokens = searchTokens.map(token => normalize(token));
let snippetStart = 0;
let matchFound = false;
for (const token of searchTokens) {
const normalizedToken = normalize(token);
for (const normalizedToken of normalizedTokens) {
const matchIndex = normalizedContent.indexOf(normalizedToken);
if (matchIndex !== -1) {
// Center the snippet around the match
snippetStart = Math.max(0, matchIndex - maxLength / 2);
matchFound = true;
break;
}
}
// Extract snippet
let snippet = content.substring(snippetStart, snippetStart + maxLength);
// Extract a snippet region from the raw content, then clean only that
const snippetRegion = content.substring(snippetStart, snippetStart + maxLength + 100);
// If snippet contains linebreaks, limit to max 4 lines and override character limit
// Normalize whitespace only on the small snippet region
let snippet = snippetRegion
.replace(/\n\s*\n/g, "\n\n")
.replace(/[ \t]+/g, " ")
.trim()
.substring(0, maxLength);
// If snippet contains linebreaks, limit to max 4 lines
const lines = snippet.split('\n');
if (lines.length > 4) {
// Find which lines contain the search tokens to ensure they're included
const normalizedLines = lines.map(line => normalize(line));
const normalizedTokens = searchTokens.map(token => normalize(token));
// Find the first line that contains a search token
let firstMatchLine = -1;
for (let i = 0; i < normalizedLines.length; i++) {
if (normalizedTokens.some(token => normalizedLines[i].includes(token))) {
for (let i = 0; i < lines.length; i++) {
const normalizedLine = normalize(lines[i]);
if (normalizedTokens.some(token => normalizedLine.includes(token))) {
firstMatchLine = i;
break;
}
}
if (firstMatchLine !== -1) {
// Center the 4-line window around the first match
// Try to show 1 line before and 2 lines after the match
const startLine = Math.max(0, firstMatchLine - 1);
const endLine = Math.min(lines.length, startLine + 4);
snippet = lines.slice(startLine, endLine).join('\n');
} else {
// No match found in lines (shouldn't happen), just take first 4
snippet = lines.slice(0, 4).join('\n');
}
// Add ellipsis if we truncated lines
snippet = snippet + "...";
} else if (lines.length > 1) {
// For multi-line snippets that are 4 or fewer lines, keep them as-is
// No need to truncate
} else {
// Single line content - apply original word boundary logic
// Try to start/end at word boundaries
} else if (lines.length <= 1) {
// Single line content - apply word boundary logic
if (snippetStart > 0) {
const firstSpace = snippet.search(/\s/);
if (firstSpace > 0 && firstSpace < 20) {
@@ -541,7 +536,7 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
}
snippet = "..." + snippet;
}
if (snippetStart + maxLength < content.length) {
const lastSpace = snippet.search(/\s[^\s]*$/);
if (lastSpace > snippet.length - 20 && lastSpace > 0) {
@@ -649,7 +644,8 @@ function searchNotesForAutocomplete(query: string, fastSearch: boolean = true) {
includeHiddenNotes: true,
fuzzyAttributeSearch: true,
ignoreInternalAttributes: true,
ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId()
ancestorNoteId: hoistedNoteService.isHoistedInHiddenSubtree() ? "root" : hoistedNoteService.getHoistedNoteId(),
limit: 200
});
const allSearchResults = findResultsWithQuery(query, searchContext);