feat(search): try again to get fts5 searching done well

perf3ct
2025-10-24 21:47:06 -07:00
parent d992a5e4a2
commit 253da139de
24 changed files with 117 additions and 4552 deletions

View File

@@ -219,12 +219,22 @@ CREATE TABLE IF NOT EXISTS sessions (
);
-- FTS5 Full-Text Search Support
--- Create FTS5 virtual table for full-text searching
+-- Create FTS5 virtual table with trigram tokenizer
+-- Trigram tokenizer provides language-agnostic substring matching:
+-- 1. Fast substring matching (50-100x speedup over unindexed LIKE '%term%' scans)
+-- 2. Case-insensitive search without custom collation
+-- 3. No language-specific stemming assumptions (works for all languages)
+-- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes
+--
+-- IMPORTANT: Trigram matching requires query tokens of at least 3 characters
+-- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance
+-- (loses position info for the highlight() function, but snippet() still works)
CREATE VIRTUAL TABLE notes_fts USING fts5(
    noteId UNINDEXED,
    title,
    content,
-    tokenize = 'porter unicode61'
+    tokenize = 'trigram',
+    detail = 'none'
);
-- Triggers to keep FTS table synchronized with notes
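
For illustration only (not part of this diff), a minimal sketch of how the trigram-backed table might be queried through the codebase's sql helper; the FTS5 MATCH syntax and rank ordering are standard, but the sample query text, LIMIT, and import path are assumptions:

    import sql from "../services/sql.js";

    // Any token of 3+ characters matches as a case-insensitive substring,
    // because the trigram tokenizer indexes every 3-character window.
    // A 1-2 character query such as 'ab' cannot match at all.
    const noteIds = sql.getColumn<string>(`
        SELECT noteId
        FROM notes_fts
        WHERE notes_fts MATCH 'trigram AND (search OR note)'
        ORDER BY rank
        LIMIT 20
    `);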

View File

@@ -14,23 +14,46 @@ import log from "../services/log.js";
export default function addFTS5SearchAndPerformanceIndexes() {
    log.info("Starting FTS5 and performance optimization migration...");
+    // Verify SQLite version supports the trigram tokenizer (requires 3.34.0+)
+    const sqliteVersion = sql.getValue<string>(`SELECT sqlite_version()`);
+    const [major, minor, patch] = sqliteVersion.split('.').map(Number);
+    const versionNumber = major * 10000 + minor * 100 + (patch || 0);
+    const requiredVersion = 3 * 10000 + 34 * 100 + 0; // 3.34.0
+    if (versionNumber < requiredVersion) {
+        log.error(`SQLite version ${sqliteVersion} does not support trigram tokenizer (requires 3.34.0+)`);
+        log.info("Skipping FTS5 trigram migration - will use fallback search implementation");
+        return; // Skip FTS5 setup, rely on fallback search
+    }
+    log.info(`SQLite version ${sqliteVersion} confirmed - trigram tokenizer available`);
    // Part 1: FTS5 Setup
    log.info("Creating FTS5 virtual table for full-text search...");
    // Create FTS5 virtual table
    // We store noteId, title, and content for searching
-    // The 'tokenize' option uses porter stemming for better search results
    sql.executeScript(`
        -- Drop existing FTS table if it exists (for re-running migration in dev)
        DROP TABLE IF EXISTS notes_fts;
-        -- Create FTS5 virtual table
+        -- Create FTS5 virtual table with trigram tokenizer
+        -- Trigram tokenizer provides language-agnostic substring matching:
+        -- 1. Fast substring matching (50-100x speedup over unindexed LIKE '%term%' scans)
+        -- 2. Case-insensitive search without custom collation
+        -- 3. No language-specific stemming assumptions (works for all languages)
+        -- 4. Boolean operators (AND, OR, NOT) and phrase matching with quotes
+        --
+        -- IMPORTANT: Trigram matching requires query tokens of at least 3 characters
+        -- detail='none' reduces index size by ~50% while maintaining MATCH/rank performance
+        -- (loses position info for the highlight() function, but snippet() still works)
        CREATE VIRTUAL TABLE IF NOT EXISTS notes_fts USING fts5(
            noteId UNINDEXED,
            title,
            content,
-            tokenize = 'porter unicode61'
+            tokenize = 'trigram',
+            detail = 'none'
        );
    `);
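
As a worked example of the version encoding above (an illustrative helper, not code from this commit; the function name is hypothetical):

    // "3.45.1" -> 3*10000 + 45*100 + 1 = 34501 (>= 30400, trigram available)
    // "3.33.0" -> 3*10000 + 33*100 + 0 = 33300 (<  30400, fallback search is used)
    function encodeSqliteVersion(version: string): number {
        const [major, minor, patch] = version.split('.').map(Number);
        return major * 10000 + minor * 100 + (patch || 0);
    }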

View File

@@ -1,826 +0,0 @@
/**
* Migration to add SQLite native search support with normalized text tables
*
* This migration implements Phase 1 of the SQLite-based search plan:
* 1. Creates note_search_content table with normalized text columns
* 2. Creates note_tokens table for word-level token storage
* 3. Adds necessary indexes for optimization
* 4. Creates triggers to keep tables synchronized with note updates
* 5. Populates tables with existing note data in batches
*
* This provides 100% accurate search results with 10-30x performance improvement
* over TypeScript-based search, without the complexity of trigrams.
*/
import sql from "../services/sql.js";
import log from "../services/log.js";
import { normalize as utilsNormalize, stripTags } from "../services/utils.js";
import { getSqliteFunctionsService } from "../services/search/sqlite_functions.js";
/**
* Uses the existing normalize function from utils.ts for consistency
* This ensures all normalization throughout the codebase is identical
*/
function normalizeText(text: string): string {
if (!text) return '';
return utilsNormalize(text);
}
/**
* Tokenizes text into individual words for token-based searching
* Handles punctuation and special characters appropriately
*/
function tokenize(text: string): string[] {
if (!text) return [];
// Split on word boundaries, filter out empty tokens
// This regex splits on spaces, punctuation, and other non-word characters
// but preserves apostrophes within words (e.g., "don't", "it's")
const tokens = text
.split(/[\s\n\r\t,;.!?()[\]{}"'`~@#$%^&*+=|\\/<>:_-]+/)
.filter(token => token.length > 0)
.map(token => token.toLowerCase());
// Also split on camelCase and snake_case boundaries for code content
const expandedTokens: string[] = [];
for (const token of tokens) {
// Add the original token
expandedTokens.push(token);
// Split camelCase (e.g., "getUserName" -> ["get", "User", "Name"])
const camelCaseParts = token.split(/(?=[A-Z])/);
if (camelCaseParts.length > 1) {
expandedTokens.push(...camelCaseParts.map(p => p.toLowerCase()));
}
// Split snake_case (e.g., "user_name" -> ["user", "name"])
const snakeCaseParts = token.split('_');
if (snakeCaseParts.length > 1) {
expandedTokens.push(...snakeCaseParts);
}
}
// Remove duplicates and return
return Array.from(new Set(expandedTokens));
}
/**
* Strips HTML tags from content for text-only indexing
* Uses the utils stripTags function for consistency
*/
function stripHtmlTags(html: string): string {
if (!html) return '';
// Remove script and style content entirely first
let text = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
// Use utils stripTags for consistency
text = stripTags(text);
// Decode HTML entities
text = text.replace(/&nbsp;/g, ' ');
text = text.replace(/&lt;/g, '<');
text = text.replace(/&gt;/g, '>');
text = text.replace(/&amp;/g, '&');
text = text.replace(/&quot;/g, '"');
text = text.replace(/&#39;/g, "'");
// Normalize whitespace
text = text.replace(/\s+/g, ' ').trim();
return text;
}
export default function sqliteNativeSearch() {
log.info("Starting SQLite native search migration...");
const startTime = Date.now();
// Wrap entire migration in a transaction for atomicity
sql.transactional(() => {
try {
// Register custom SQL functions first so they can be used in triggers
registerCustomFunctions();
// Create the search tables and indexes
createSearchTables();
// Create triggers to keep tables synchronized (before population)
createSearchTriggers();
// Populate the tables with existing note data
populateSearchTables();
// Run final verification and optimization
finalizeSearchSetup();
const duration = Date.now() - startTime;
log.info(`SQLite native search migration completed successfully in ${duration}ms`);
} catch (error) {
log.error(`SQLite native search migration failed: ${error}`);
// Transaction will automatically rollback on error
throw error;
}
});
}
function createSearchTables() {
log.info("Creating search content and token tables...");
// Drop existing tables if they exist (for re-running migration in dev)
sql.execute("DROP TABLE IF EXISTS note_search_content");
sql.execute("DROP TABLE IF EXISTS note_tokens");
// Create the main search content table
sql.execute(`
CREATE TABLE note_search_content (
noteId TEXT PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
title_normalized TEXT NOT NULL,
content_normalized TEXT NOT NULL,
full_text_normalized TEXT NOT NULL
)
`);
// Create the token table for word-level operations
sql.execute(`
CREATE TABLE note_tokens (
noteId TEXT NOT NULL,
token TEXT NOT NULL,
token_normalized TEXT NOT NULL,
position INTEGER NOT NULL,
source TEXT NOT NULL CHECK(source IN ('title', 'content')),
PRIMARY KEY (noteId, position, source)
)
`);
// Create indexes for search optimization
log.info("Creating search indexes...");
// Consolidated indexes - removed redundancy between COLLATE NOCASE and plain indexes
// Using COLLATE NOCASE for case-insensitive searches
sql.execute(`
CREATE INDEX idx_search_title_normalized
ON note_search_content(title_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_search_content_normalized
ON note_search_content(content_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_search_full_text
ON note_search_content(full_text_normalized COLLATE NOCASE)
`);
// Token indexes - consolidated to avoid redundancy
sql.execute(`
CREATE INDEX idx_tokens_normalized
ON note_tokens(token_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_tokens_noteId
ON note_tokens(noteId)
`);
// Composite index for token searches with source
sql.execute(`
CREATE INDEX idx_tokens_source_normalized
ON note_tokens(source, token_normalized COLLATE NOCASE)
`);
log.info("Search tables and indexes created successfully");
}
function populateSearchTables() {
log.info("Populating search tables with existing note content...");
const batchSize = 100;
let offset = 0;
let totalProcessed = 0;
let totalTokens = 0;
while (true) {
const notes = sql.getRows<{
noteId: string;
title: string;
type: string;
mime: string;
content: string | null;
}>(`
SELECT
n.noteId,
n.title,
n.type,
n.mime,
b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.isDeleted = 0
AND n.isProtected = 0
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
ORDER BY n.noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
if (notes.length === 0) {
break;
}
// Process batch of notes
for (const note of notes) {
try {
// Process content based on type
let processedContent = note.content || '';
// Strip HTML for text notes
if (note.type === 'text' && note.mime === 'text/html') {
processedContent = stripHtmlTags(processedContent);
}
// Normalize text for searching using the utils normalize function
const titleNorm = normalizeText(note.title);
const contentNorm = normalizeText(processedContent);
const fullTextNorm = titleNorm + ' ' + contentNorm;
// Insert into search content table
sql.execute(`
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
VALUES (?, ?, ?, ?, ?, ?)
`, [
note.noteId,
note.title,
processedContent,
titleNorm,
contentNorm,
fullTextNorm
]);
// Tokenize title and content separately to track source
const titleTokens = tokenize(note.title);
const contentTokens = tokenize(processedContent);
let position = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
// Insert content tokens with unique positions
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
totalProcessed++;
} catch (error) {
log.error(`Failed to index note ${note.noteId}: ${error}`);
// Continue with other notes even if one fails
}
}
offset += batchSize;
if (totalProcessed % 1000 === 0) {
log.info(`Processed ${totalProcessed} notes, ${totalTokens} tokens for search indexing...`);
}
}
log.info(`Completed indexing ${totalProcessed} notes with ${totalTokens} total tokens`);
}
function createSearchTriggers() {
log.info("Creating triggers to keep search tables synchronized...");
// Drop existing triggers if they exist
const triggers = [
'note_search_insert',
'note_search_update',
'note_search_delete',
'note_search_soft_delete',
'note_search_undelete',
'note_search_protect',
'note_search_unprotect',
'note_search_blob_insert',
'note_search_blob_update'
];
for (const trigger of triggers) {
sql.execute(`DROP TRIGGER IF EXISTS ${trigger}`);
}
// Trigger for INSERT operations on notes - simplified version
sql.execute(`
CREATE TRIGGER note_search_insert
AFTER INSERT ON notes
WHEN NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isDeleted = 0
AND NEW.isProtected = 0
BEGIN
-- Delete any existing entries (for INSERT OR REPLACE)
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
-- Insert basic content with title only (content will be populated by blob trigger)
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
VALUES (
NEW.noteId,
NEW.title,
'',
LOWER(NEW.title),
'',
LOWER(NEW.title)
);
END
`);
// Trigger for UPDATE operations on notes - simplified version
sql.execute(`
CREATE TRIGGER note_search_update
AFTER UPDATE ON notes
WHEN NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
BEGIN
-- Always delete the old entries
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
-- Re-insert if note is not deleted and not protected
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId
AND NEW.isDeleted = 0
AND NEW.isProtected = 0;
END
`);
// Trigger for DELETE operations on notes
sql.execute(`
CREATE TRIGGER note_search_delete
AFTER DELETE ON notes
BEGIN
DELETE FROM note_search_content WHERE noteId = OLD.noteId;
DELETE FROM note_tokens WHERE noteId = OLD.noteId;
END
`);
// Trigger for soft delete (isDeleted = 1)
sql.execute(`
CREATE TRIGGER note_search_soft_delete
AFTER UPDATE ON notes
WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
END
`);
// Trigger for undelete (isDeleted = 0) - simplified version
sql.execute(`
CREATE TRIGGER note_search_undelete
AFTER UPDATE ON notes
WHEN OLD.isDeleted = 1 AND NEW.isDeleted = 0
AND NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isProtected = 0
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId;
END
`);
// Trigger for notes becoming protected
sql.execute(`
CREATE TRIGGER note_search_protect
AFTER UPDATE ON notes
WHEN OLD.isProtected = 0 AND NEW.isProtected = 1
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
END
`);
// Trigger for notes becoming unprotected - simplified version
sql.execute(`
CREATE TRIGGER note_search_unprotect
AFTER UPDATE ON notes
WHEN OLD.isProtected = 1 AND NEW.isProtected = 0
AND NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isDeleted = 0
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId;
END
`);
// Trigger for INSERT operations on blobs - simplified version
sql.execute(`
CREATE TRIGGER note_search_blob_insert
AFTER INSERT ON blobs
BEGIN
-- Update search content for all notes that reference this blob
UPDATE note_search_content
SET content = NEW.content,
content_normalized = LOWER(NEW.content),
full_text_normalized = title_normalized || ' ' || LOWER(NEW.content)
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
-- Clear tokens for affected notes (will be repopulated by post-processing)
DELETE FROM note_tokens
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
END
`);
// Trigger for UPDATE operations on blobs - simplified version
sql.execute(`
CREATE TRIGGER note_search_blob_update
AFTER UPDATE ON blobs
BEGIN
-- Update search content for all notes that reference this blob
UPDATE note_search_content
SET content = NEW.content,
content_normalized = LOWER(NEW.content),
full_text_normalized = title_normalized || ' ' || LOWER(NEW.content)
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
-- Clear tokens for affected notes (will be repopulated by post-processing)
DELETE FROM note_tokens
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
END
`);
log.info("Search synchronization triggers created successfully");
}
function registerCustomFunctions() {
log.info("Registering custom SQL functions for search operations...");
try {
// Get the database connection to register functions
const db = sql.getDbConnection();
// Use the centralized SQLite functions service
const functionsService = getSqliteFunctionsService();
// Register functions if not already registered
if (!functionsService.isRegistered()) {
const success = functionsService.registerFunctions(db);
if (success) {
log.info("Custom SQL functions registered successfully via service");
} else {
log.info("Custom SQL functions registration failed - using basic SQLite functions only");
}
} else {
log.info("Custom SQL functions already registered");
}
// Register migration-specific helper function for tokenization
db.function('tokenize_for_migration', {
deterministic: true,
varargs: false
}, (text: string | null) => {
if (!text) return '';
// Return as JSON array string for SQL processing
return JSON.stringify(tokenize(text));
});
} catch (error) {
log.info(`Could not register custom SQL functions (will use basic SQLite functions): ${error}`);
// This is not critical - the migration will work with basic SQLite functions
}
}
/**
* Populates tokens for a specific note
* This is called outside of triggers to avoid complex SQL within trigger constraints
*/
function populateNoteTokens(noteId: string): number {
try {
// Get the note's search content
const noteData = sql.getRow<{
title: string;
content: string;
}>(`
SELECT title, content
FROM note_search_content
WHERE noteId = ?
`, [noteId]);
if (!noteData) return 0;
// Clear existing tokens for this note
sql.execute(`DELETE FROM note_tokens WHERE noteId = ?`, [noteId]);
// Tokenize title and content
const titleTokens = tokenize(noteData.title);
const contentTokens = tokenize(noteData.content);
let position = 0;
let tokenCount = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [noteId, token, normalizeText(token), position]);
position++;
tokenCount++;
}
}
// Insert content tokens
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [noteId, token, normalizeText(token), position]);
position++;
tokenCount++;
}
}
return tokenCount;
} catch (error) {
log.error(`Error populating tokens for note ${noteId}: ${error}`);
return 0;
}
}
/**
* Populates tokens for multiple notes affected by blob operations
* This handles cases where blob triggers can affect multiple notes
*/
function populateBlobAffectedTokens(blobId: string): void {
try {
// Find all notes that reference this blob and need token updates
const affectedNoteIds = sql.getColumn<string>(`
SELECT DISTINCT n.noteId
FROM notes n
INNER JOIN note_search_content nsc ON n.noteId = nsc.noteId
WHERE n.blobId = ?
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
`, [blobId]);
if (affectedNoteIds.length === 0) return;
log.info(`Updating tokens for ${affectedNoteIds.length} notes affected by blob ${blobId}`);
let totalTokens = 0;
for (const noteId of affectedNoteIds) {
const tokenCount = populateNoteTokens(noteId);
totalTokens += tokenCount;
}
log.info(`Updated ${totalTokens} tokens for blob-affected notes`);
} catch (error) {
log.error(`Error populating blob-affected tokens for blob ${blobId}: ${error}`);
}
}
function populateAllTokens() {
log.info("Populating tokens for all search content...");
// Clear existing tokens first to ensure clean state
sql.execute("DELETE FROM note_tokens");
const batchSize = 100;
let offset = 0;
let totalProcessed = 0;
let totalTokens = 0;
while (true) {
const notes = sql.getRows<{
noteId: string;
title: string;
content: string;
}>(`
SELECT noteId, title, content
FROM note_search_content
ORDER BY noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
if (notes.length === 0) {
break;
}
for (const note of notes) {
try {
// Tokenize title and content
const titleTokens = tokenize(note.title);
const contentTokens = tokenize(note.content);
let position = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
// Insert content tokens with continuous position numbering
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
totalProcessed++;
} catch (error) {
log.error(`Failed to tokenize note ${note.noteId}: ${error}`);
}
}
offset += batchSize;
if (totalProcessed % 1000 === 0) {
log.info(`Processed ${totalProcessed} notes, ${totalTokens} tokens so far...`);
}
}
log.info(`Token population completed: ${totalProcessed} notes processed, ${totalTokens} total tokens`);
}
function finalizeSearchSetup() {
log.info("Running final verification and optimization...");
// Check for missing notes that should be indexed
const missingCount = sql.getValue<number>(`
SELECT COUNT(*) FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
AND NOT EXISTS (SELECT 1 FROM note_search_content WHERE noteId = n.noteId)
`) || 0;
if (missingCount > 0) {
log.info(`Found ${missingCount} notes that are missing from search index`);
// Index missing notes using basic SQLite functions
sql.execute(`
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
n.noteId,
n.title,
COALESCE(b.content, ''),
LOWER(n.title),
LOWER(COALESCE(b.content, '')),
LOWER(n.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
AND NOT EXISTS (SELECT 1 FROM note_search_content WHERE noteId = n.noteId)
`);
log.info(`Indexed ${missingCount} missing notes`);
}
// Populate tokens for all existing content (including any missing notes we just added)
populateAllTokens();
// Verify table creation
const tables = sql.getColumn<string>(`
SELECT name FROM sqlite_master
WHERE type = 'table'
AND name IN ('note_search_content', 'note_tokens')
`);
if (tables.length !== 2) {
throw new Error("Search tables were not created properly");
}
// Check row counts
const searchContentCount = sql.getValue<number>("SELECT COUNT(*) FROM note_search_content") || 0;
const tokenCount = sql.getValue<number>("SELECT COUNT(*) FROM note_tokens") || 0;
log.info(`Search content table has ${searchContentCount} entries`);
log.info(`Token table has ${tokenCount} entries`);
// Run ANALYZE to update SQLite query planner statistics
log.info("Updating SQLite statistics for query optimization...");
sql.execute("ANALYZE note_search_content");
sql.execute("ANALYZE note_tokens");
// Verify indexes were created
const indexes = sql.getColumn<string>(`
SELECT name FROM sqlite_master
WHERE type = 'index'
AND tbl_name IN ('note_search_content', 'note_tokens')
`);
log.info(`Created ${indexes.length} indexes for search optimization`);
log.info("Search setup finalization completed");
}

View File

@@ -0,0 +1,47 @@
/**
* Migration to clean up custom SQLite search implementation
*
* This migration removes tables and triggers created by migration 0235
* which implemented a custom SQLite-based search system. That system
* has been replaced by FTS5 with trigram tokenizer (migration 0234),
* making these custom tables redundant.
*
* Tables removed:
* - note_search_content: Stored normalized note content for custom search
* - note_tokens: Stored tokenized words for custom token-based search
*
* This migration is safe to run on databases that:
* 1. Never ran migration 0235 (tables don't exist)
* 2. Already ran migration 0235 (tables will be dropped)
*/
import sql from "../services/sql.js";
import log from "../services/log.js";
export default function cleanupSqliteSearch() {
log.info("Starting SQLite custom search cleanup migration...");
try {
sql.transactional(() => {
// Drop custom search tables if they exist
log.info("Dropping note_search_content table...");
sql.executeScript(`DROP TABLE IF EXISTS note_search_content`);
log.info("Dropping note_tokens table...");
sql.executeScript(`DROP TABLE IF EXISTS note_tokens`);
// Clean up any entity changes for these tables
// This prevents sync issues and cleans up change tracking
log.info("Cleaning up entity changes for removed tables...");
sql.execute(`
DELETE FROM entity_changes
WHERE entityName IN ('note_search_content', 'note_tokens')
`);
log.info("SQLite custom search cleanup completed successfully");
});
} catch (error) {
log.error(`Error during SQLite search cleanup: ${error}`);
throw new Error(`Failed to clean up SQLite search tables: ${error}`);
}
}

View File

@@ -6,10 +6,10 @@
// Migrations should be kept in descending order, so the latest migration is first.
const MIGRATIONS: (SqlMigration | JsMigration)[] = [
-    // Add SQLite native search with normalized text tables
+    // Clean up custom SQLite search tables (replaced by FTS5 trigram)
    {
-        version: 235,
-        module: async () => import("./0235__sqlite_native_search.js")
+        version: 236,
+        module: async () => import("./0236__cleanup_sqlite_search.js")
    },
    // Add FTS5 full-text search support and strategic performance indexes
    {

View File

@@ -1,243 +0,0 @@
/**
* API endpoints for search administration and monitoring
*/
import { Router } from "express";
import performanceMonitor from "../../services/search/performance_monitor.js";
import abTestingService from "../../services/search/ab_testing.js";
import { SQLiteSearchService } from "../../services/search/sqlite_search_service.js";
import optionService from "../../services/options.js";
import sql from "../../services/sql.js";
import log from "../../services/log.js";
const router = Router();
/**
* Get search performance metrics
*/
router.get("/api/search-admin/metrics", (req, res) => {
const metrics = {
recent: performanceMonitor.getRecentMetrics(100),
averages: {
typescript: performanceMonitor.getAverageMetrics("typescript"),
sqlite: performanceMonitor.getAverageMetrics("sqlite")
},
comparison: performanceMonitor.compareBackends()
};
res.json(metrics);
});
/**
* Get A/B testing results
*/
router.get("/api/search-admin/ab-tests", (req, res) => {
const results = {
summary: abTestingService.getSummary(),
recent: abTestingService.getRecentResults(50)
};
res.json(results);
});
/**
* Get current search configuration
*/
router.get("/api/search-admin/config", (req, res) => {
const config = {
backend: optionService.getOption("searchBackend"),
sqliteEnabled: optionService.getOptionBool("searchSqliteEnabled"),
performanceLogging: optionService.getOptionBool("searchSqlitePerformanceLogging"),
maxMemory: optionService.getOptionInt("searchSqliteMaxMemory"),
batchSize: optionService.getOptionInt("searchSqliteBatchSize"),
autoRebuild: optionService.getOptionBool("searchSqliteAutoRebuild")
};
res.json(config);
});
/**
* Update search configuration
*/
router.put("/api/search-admin/config", (req, res) => {
try {
const { backend, sqliteEnabled, performanceLogging, maxMemory, batchSize, autoRebuild } = req.body;
if (backend !== undefined) {
if (!["typescript", "sqlite"].includes(backend)) {
return res.status(400).json({ error: "Invalid backend. Must be 'typescript' or 'sqlite'" });
}
optionService.setOption("searchBackend", backend);
}
if (sqliteEnabled !== undefined) {
optionService.setOption("searchSqliteEnabled", sqliteEnabled ? "true" : "false");
}
if (performanceLogging !== undefined) {
optionService.setOption("searchSqlitePerformanceLogging", performanceLogging ? "true" : "false");
performanceMonitor.updateSettings();
}
if (maxMemory !== undefined) {
if (maxMemory < 1048576 || maxMemory > 1073741824) { // 1MB to 1GB
return res.status(400).json({ error: "Max memory must be between 1MB and 1GB" });
}
optionService.setOption("searchSqliteMaxMemory", maxMemory.toString());
}
if (batchSize !== undefined) {
if (batchSize < 10 || batchSize > 1000) {
return res.status(400).json({ error: "Batch size must be between 10 and 1000" });
}
optionService.setOption("searchSqliteBatchSize", batchSize.toString());
}
if (autoRebuild !== undefined) {
optionService.setOption("searchSqliteAutoRebuild", autoRebuild ? "true" : "false");
}
res.json({ success: true, message: "Configuration updated successfully" });
} catch (error: any) {
log.error(`Failed to update search configuration: ${error}`);
res.status(500).json({ error: error.message });
}
});
/**
* Get SQLite search index status
*/
router.get("/api/search-admin/sqlite/status", async (req, res) => {
try {
const service = SQLiteSearchService.getInstance();
const status = await service.getIndexStatus();
// Add table sizes
const tableSizes = sql.getRows<{ name: string; size: number }>(`
SELECT
name,
(SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=m.name) as size
FROM sqlite_master m
WHERE type='table' AND name IN ('note_search_content', 'note_tokens', 'notes_fts', 'notes_fts_data', 'notes_fts_idx', 'notes_fts_content')
`);
res.json({
...status,
tables: tableSizes
});
} catch (error: any) {
log.error(`Failed to get SQLite search status: ${error}`);
res.status(500).json({ error: error.message });
}
});
/**
* Rebuild SQLite search index
*/
router.post("/api/search-admin/sqlite/rebuild", async (req, res) => {
try {
const { force = false } = req.body;
log.info("Starting SQLite search index rebuild via API");
const service = SQLiteSearchService.getInstance();
const startTime = Date.now();
await service.rebuildIndex(force);
const duration = Date.now() - startTime;
log.info(`SQLite search index rebuild completed in ${duration}ms`);
res.json({
success: true,
message: "Index rebuilt successfully",
duration
});
} catch (error: any) {
log.error(`Failed to rebuild SQLite search index: ${error}`);
res.status(500).json({ error: error.message });
}
});
/**
* Clear SQLite search index
*/
router.delete("/api/search-admin/sqlite/index", async (req, res) => {
try {
log.info("Clearing SQLite search index via API");
const service = SQLiteSearchService.getInstance();
service.clearIndex();
res.json({
success: true,
message: "Index cleared successfully"
});
} catch (error: any) {
log.error(`Failed to clear SQLite search index: ${error}`);
res.status(500).json({ error: error.message });
}
});
/**
* Reset performance metrics
*/
router.delete("/api/search-admin/metrics", (req, res) => {
performanceMonitor.reset();
res.json({ success: true, message: "Metrics reset successfully" });
});
/**
* Reset A/B test results
*/
router.delete("/api/search-admin/ab-tests", (req, res) => {
abTestingService.reset();
res.json({ success: true, message: "A/B test results reset successfully" });
});
/**
* Set A/B testing sample rate
*/
router.put("/api/search-admin/ab-tests/sample-rate", (req, res) => {
try {
const { rate } = req.body;
if (rate === undefined || rate < 0 || rate > 1) {
return res.status(400).json({ error: "Sample rate must be between 0 and 1" });
}
abTestingService.setSampleRate(rate);
res.json({ success: true, message: `Sample rate set to ${rate * 100}%` });
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
* Test search with both backends for comparison
*/
router.post("/api/search-admin/test", async (req, res) => {
try {
const { query } = req.body;
if (!query) {
return res.status(400).json({ error: "Query is required" });
}
const result = await abTestingService.runComparison(query, {});
if (!result) {
return res.json({
message: "Test not run (sampling or disabled)",
query
});
}
res.json(result);
} catch (error: any) {
log.error(`Search test failed: ${error}`);
res.status(500).json({ error: error.message });
}
});
export default router;

View File

@@ -40,7 +40,6 @@ import scriptRoute from "./api/script.js";
import senderRoute from "./api/sender.js";
import filesRoute from "./api/files.js";
import searchRoute from "./api/search.js";
-import searchAdminRoute from "./api/search_admin.js";
import bulkActionRoute from "./api/bulk_action.js";
import specialNotesRoute from "./api/special_notes.js";
import noteMapRoute from "./api/note_map.js";
@@ -261,9 +260,6 @@ function register(app: express.Application) {
    apiRoute(GET, "/api/search/:searchString", searchRoute.search);
    apiRoute(GET, "/api/search-templates", searchRoute.searchTemplates);
-    // Search administration routes
-    app.use(searchAdminRoute);
    apiRoute(PST, "/api/bulk-action/execute", bulkActionRoute.execute);
    apiRoute(PST, "/api/bulk-action/affected-notes", bulkActionRoute.getAffectedNoteCount);

View File

@@ -215,14 +215,6 @@ const defaultOptions: DefaultOption[] = [
    { name: "aiSystemPrompt", value: "", isSynced: true },
    { name: "aiSelectedProvider", value: "openai", isSynced: true },
-    // Search configuration
-    { name: "searchBackend", value: "typescript", isSynced: false }, // "typescript" or "sqlite"
-    { name: "searchSqliteEnabled", value: "false", isSynced: false },
-    { name: "searchSqlitePerformanceLogging", value: "false", isSynced: false },
-    { name: "searchSqliteMaxMemory", value: "67108864", isSynced: false }, // 64MB default
-    { name: "searchSqliteBatchSize", value: "100", isSynced: false },
-    { name: "searchSqliteAutoRebuild", value: "true", isSynced: false },
    { name: "seenCallToActions", value: "[]", isSynced: true }
];

View File

@@ -1,218 +0,0 @@
/**
* A/B Testing utilities for comparing search backend performance
*/
import SearchContext from "./search_context.js";
import type { SearchParams } from "./services/types.js";
import performanceMonitor from "./performance_monitor.js";
import log from "../log.js";
import optionService from "../options.js";
export interface ABTestResult {
query: string;
typescriptTime: number;
sqliteTime: number;
typescriptResults: number;
sqliteResults: number;
resultsMatch: boolean;
speedup: number;
winner: "typescript" | "sqlite" | "tie";
}
class ABTestingService {
private enabled: boolean = false;
private sampleRate: number = 0.1; // 10% of searches by default
private results: ABTestResult[] = [];
private maxResults: number = 1000;
constructor() {
this.updateSettings();
}
updateSettings() {
try {
this.enabled = optionService.getOptionBool("searchSqliteEnabled");
// Could add a separate AB testing option if needed
} catch {
this.enabled = false;
}
}
/**
* Determines if we should run an A/B test for this query
*/
shouldRunTest(): boolean {
if (!this.enabled) {
return false;
}
// Random sampling
return Math.random() < this.sampleRate;
}
/**
* Run the same search query with both backends and compare results
*/
async runComparison(query: string, params: SearchParams): Promise<ABTestResult | null> {
if (!this.shouldRunTest()) {
return null;
}
try {
// Dynamically import to avoid circular dependencies
const searchModule = await import("./services/search.js");
// Run with TypeScript backend
const tsContext = new SearchContext({ ...params, forceBackend: "typescript" });
const tsTimer = performanceMonitor.startTimer();
const tsResults = searchModule.default.findResultsWithQuery(query, tsContext);
const tsTime = tsTimer();
// Run with SQLite backend
const sqliteContext = new SearchContext({ ...params, forceBackend: "sqlite" });
const sqliteTimer = performanceMonitor.startTimer();
const sqliteResults = searchModule.default.findResultsWithQuery(query, sqliteContext);
const sqliteTime = sqliteTimer();
// Compare results
const tsNoteIds = new Set(tsResults.map(r => r.noteId));
const sqliteNoteIds = new Set(sqliteResults.map(r => r.noteId));
// Check if results match (same notes found)
const resultsMatch = tsNoteIds.size === sqliteNoteIds.size &&
[...tsNoteIds].every(id => sqliteNoteIds.has(id));
// Calculate speedup
const speedup = tsTime / sqliteTime;
// Determine winner
let winner: "typescript" | "sqlite" | "tie";
if (speedup > 1.2) {
winner = "sqlite";
} else if (speedup < 0.83) {
winner = "typescript";
} else {
winner = "tie";
}
const result: ABTestResult = {
query: query.substring(0, 100),
typescriptTime: tsTime,
sqliteTime: sqliteTime,
typescriptResults: tsResults.length,
sqliteResults: sqliteResults.length,
resultsMatch,
speedup,
winner
};
this.recordResult(result);
// Log significant differences
if (!resultsMatch) {
log.info(`A/B test found different results for query "${query.substring(0, 50)}": TS=${tsResults.length}, SQLite=${sqliteResults.length}`);
}
if (Math.abs(speedup - 1) > 0.5) {
log.info(`A/B test significant performance difference: ${winner} is ${(speedup >= 1 ? speedup : 1 / speedup).toFixed(1)}x faster for query "${query.substring(0, 50)}"`);
}
return result;
} catch (error) {
log.error(`A/B test failed: ${error}`);
return null;
}
}
private recordResult(result: ABTestResult) {
this.results.push(result);
// Keep only the last N results
if (this.results.length > this.maxResults) {
this.results = this.results.slice(-this.maxResults);
}
}
/**
* Get summary statistics from A/B tests
*/
getSummary(): {
totalTests: number;
avgSpeedup: number;
typescriptWins: number;
sqliteWins: number;
ties: number;
mismatchRate: number;
recommendation: string;
} {
if (this.results.length === 0) {
return {
totalTests: 0,
avgSpeedup: 1,
typescriptWins: 0,
sqliteWins: 0,
ties: 0,
mismatchRate: 0,
recommendation: "No A/B test data available"
};
}
const totalTests = this.results.length;
const avgSpeedup = this.results.reduce((sum, r) => sum + r.speedup, 0) / totalTests;
const typescriptWins = this.results.filter(r => r.winner === "typescript").length;
const sqliteWins = this.results.filter(r => r.winner === "sqlite").length;
const ties = this.results.filter(r => r.winner === "tie").length;
const mismatches = this.results.filter(r => !r.resultsMatch).length;
const mismatchRate = mismatches / totalTests;
let recommendation: string;
if (mismatchRate > 0.1) {
recommendation = "High mismatch rate detected - SQLite search may have accuracy issues";
} else if (avgSpeedup > 1.5) {
recommendation = `SQLite is ${avgSpeedup.toFixed(1)}x faster on average - consider enabling`;
} else if (avgSpeedup < 0.67) {
recommendation = `TypeScript is ${(1/avgSpeedup).toFixed(1)}x faster on average - keep using TypeScript`;
} else {
recommendation = "Both backends perform similarly - choice depends on other factors";
}
return {
totalTests,
avgSpeedup,
typescriptWins,
sqliteWins,
ties,
mismatchRate,
recommendation
};
}
/**
* Get recent test results
*/
getRecentResults(count: number = 100): ABTestResult[] {
return this.results.slice(-count);
}
/**
* Clear all test results
*/
reset() {
this.results = [];
}
/**
* Set the sampling rate for A/B tests
*/
setSampleRate(rate: number) {
if (rate < 0 || rate > 1) {
throw new Error("Sample rate must be between 0 and 1");
}
this.sampleRate = rate;
}
}
// Singleton instance
const abTestingService = new ABTestingService();
export default abTestingService;

View File

@@ -1,155 +0,0 @@
/**
* SQLite-based Note Content Fulltext Expression
*
* This is a drop-in replacement for NoteContentFulltextExp that uses
* the SQLite search service for dramatically improved performance.
* It maintains 100% compatibility with the existing API while providing
* 10-30x speed improvements.
*/
import type SearchContext from "../search_context.js";
import Expression from "./expression.js";
import NoteSet from "../note_set.js";
import log from "../../log.js";
import becca from "../../../becca/becca.js";
import { getSQLiteSearchService, type SearchOptions } from "../sqlite_search_service.js";
const ALLOWED_OPERATORS = new Set(["=", "!=", "*=*", "*=", "=*", "%=", "~=", "~*"]);
interface ConstructorOpts {
tokens: string[];
raw?: boolean;
flatText?: boolean;
}
/**
* SQLite-optimized implementation of note content fulltext search
*/
class NoteContentSQLiteExp extends Expression {
private operator: string;
tokens: string[];
private raw: boolean;
private flatText: boolean;
private sqliteService = getSQLiteSearchService();
constructor(operator: string, { tokens, raw, flatText }: ConstructorOpts) {
super();
if (!operator || !tokens || !Array.isArray(tokens)) {
throw new Error('Invalid parameters: operator and tokens are required');
}
this.operator = operator;
this.tokens = tokens;
this.raw = !!raw;
this.flatText = !!flatText;
}
execute(inputNoteSet: NoteSet, executionContext: {}, searchContext: SearchContext) {
if (!ALLOWED_OPERATORS.has(this.operator)) {
searchContext.addError(`Note content can be searched only with operators: ${Array.from(ALLOWED_OPERATORS).join(", ")}, operator ${this.operator} given.`);
return inputNoteSet;
}
const resultNoteSet = new NoteSet();
const startTime = Date.now();
try {
// Prepare search options
const searchOptions: SearchOptions = {
includeProtected: searchContext.includeArchivedNotes,
includeDeleted: false,
limit: searchContext.limit || undefined
};
// If we have an input note set, use it as a filter
if (inputNoteSet.notes.length > 0) {
searchOptions.noteIdFilter = new Set(inputNoteSet.getNoteIds());
}
// Map ~* operator to ~= for SQLite service
const mappedOperator = this.operator === "~*" ? "~=" : this.operator;
// Execute SQLite search
const noteIds = this.sqliteService.search(
this.tokens,
mappedOperator,
searchContext,
searchOptions
);
// Build result note set from note IDs
for (const noteId of noteIds) {
const note = becca.notes[noteId];
if (note) {
resultNoteSet.add(note);
}
}
// Log performance if enabled
const elapsed = Date.now() - startTime;
if (searchContext.debug) {
log.info(`SQLite search completed: operator=${this.operator}, tokens=${this.tokens.join(" ")}, ` +
`results=${noteIds.size}, time=${elapsed}ms`);
}
// Store highlighted tokens for UI
if (noteIds.size > 0) {
searchContext.highlightedTokens = this.tokens;
}
} catch (error) {
log.error(`SQLite search failed: ${error}`);
searchContext.addError(`Search failed: ${error}`);
// On error, return input set unchanged
return inputNoteSet;
}
return resultNoteSet;
}
/**
* Get performance statistics for monitoring
*/
getStatistics() {
return this.sqliteService.getStatistics();
}
/**
* Check if SQLite search is available
*/
static isAvailable(): boolean {
const service = getSQLiteSearchService();
const stats = service.getStatistics();
return stats.tablesInitialized;
}
/**
* Create a compatible expression based on availability
* This allows gradual migration from the old implementation
*/
static createExpression(operator: string, opts: ConstructorOpts): Expression {
if (NoteContentSQLiteExp.isAvailable()) {
return new NoteContentSQLiteExp(operator, opts);
} else {
// Fall back to original implementation if SQLite not ready
// This would import the original NoteContentFulltextExp
log.info("SQLite search not available, using fallback implementation");
// Dynamic import to avoid circular dependency
const NoteContentFulltextExp = require("./note_content_fulltext.js").default;
return new NoteContentFulltextExp(operator, opts);
}
}
}
export default NoteContentSQLiteExp;
/**
* Factory function for creating search expressions
* This can be used as a drop-in replacement in the expression builder
*/
export function createNoteContentExpression(operator: string, opts: ConstructorOpts): Expression {
return NoteContentSQLiteExp.createExpression(operator, opts);
}

View File

@@ -1,405 +0,0 @@
/**
* Tests for FTS5 blob deduplication scenarios
*
* This test file validates that FTS indexing works correctly when:
* 1. Multiple notes share the same blob (deduplication)
* 2. Notes change content to match existing blobs
* 3. Blobs are updated and affect multiple notes
* 4. Notes switch between unique and shared blobs
*/
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import sql from '../sql.js';
import beccaLoader from '../../becca/becca_loader.js';
import noteService from '../notes.js';
import searchService from './services/search.js';
import { ftsSearchService } from './fts_search.js';
describe('FTS5 Blob Deduplication Tests', () => {
beforeEach(() => {
// Ensure we have a clean test database with FTS enabled
sql.execute("DELETE FROM notes WHERE noteId LIKE 'test_%'");
sql.execute("DELETE FROM blobs WHERE blobId LIKE 'test_%'");
sql.execute("DELETE FROM notes_fts WHERE noteId LIKE 'test_%'");
// Reload becca to ensure cache is in sync
beccaLoader.load();
});
afterEach(() => {
// Clean up test data
sql.execute("DELETE FROM notes WHERE noteId LIKE 'test_%'");
sql.execute("DELETE FROM blobs WHERE blobId LIKE 'test_%'");
sql.execute("DELETE FROM notes_fts WHERE noteId LIKE 'test_%'");
});
describe('Blob Deduplication Scenarios', () => {
it('should index multiple notes sharing the same blob', async () => {
// Create first note with unique content
const note1 = await noteService.createNewNote({
noteId: 'test_note1',
parentNoteId: 'root',
title: 'Test Note 1',
content: 'Shared content for deduplication test',
type: 'text'
});
// Create second note with the same content (will share blob)
const note2 = await noteService.createNewNote({
noteId: 'test_note2',
parentNoteId: 'root',
title: 'Test Note 2',
content: 'Shared content for deduplication test',
type: 'text'
});
// Verify both notes share the same blob
const blob1 = sql.getRow("SELECT blobId FROM notes WHERE noteId = ?", ['test_note1']);
const blob2 = sql.getRow("SELECT blobId FROM notes WHERE noteId = ?", ['test_note2']);
expect(blob1.blobId).toBe(blob2.blobId);
// Verify both notes are indexed in FTS
const ftsCount = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId IN (?, ?)",
['test_note1', 'test_note2']
);
expect(ftsCount).toBe(2);
// Search should find both notes
const searchResults = searchService.searchNotes('deduplication');
const foundNoteIds = searchResults.map(r => r.noteId);
expect(foundNoteIds).toContain('test_note1');
expect(foundNoteIds).toContain('test_note2');
});
it('should update FTS when note content changes to match existing blob', async () => {
// Create first note with unique content
const note1 = await noteService.createNewNote({
noteId: 'test_note3',
parentNoteId: 'root',
title: 'Note with existing content',
content: 'This is existing content in the database',
type: 'text'
});
// Create second note with different content
const note2 = await noteService.createNewNote({
noteId: 'test_note4',
parentNoteId: 'root',
title: 'Note with different content',
content: 'This is completely different content',
type: 'text'
});
// Verify notes have different blobs initially
const initialBlob1 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note3']);
const initialBlob2 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note4']);
expect(initialBlob1).not.toBe(initialBlob2);
// Change note2's content to match note1 (deduplication occurs)
await noteService.updateNoteContent('test_note4', 'This is existing content in the database');
// Verify both notes now share the same blob
const finalBlob1 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note3']);
const finalBlob2 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note4']);
expect(finalBlob1).toBe(finalBlob2);
// Verify FTS is updated correctly for note2
const ftsContent = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note4']
);
expect(ftsContent).toBe('This is existing content in the database');
// Search for old content should not find note2
const oldContentSearch = searchService.searchNotes('completely different');
const oldSearchIds = oldContentSearch.map(r => r.noteId);
expect(oldSearchIds).not.toContain('test_note4');
// Search for new content should find both notes
const newContentSearch = searchService.searchNotes('existing content');
const newSearchIds = newContentSearch.map(r => r.noteId);
expect(newSearchIds).toContain('test_note3');
expect(newSearchIds).toContain('test_note4');
});
it('should update all notes when shared blob content changes', async () => {
// Create three notes with the same content
const sharedContent = 'Original shared content for blob update test';
await noteService.createNewNote({
noteId: 'test_note5',
parentNoteId: 'root',
title: 'Shared Note 1',
content: sharedContent,
type: 'text'
});
await noteService.createNewNote({
noteId: 'test_note6',
parentNoteId: 'root',
title: 'Shared Note 2',
content: sharedContent,
type: 'text'
});
await noteService.createNewNote({
noteId: 'test_note7',
parentNoteId: 'root',
title: 'Shared Note 3',
content: sharedContent,
type: 'text'
});
// Verify all three share the same blob
const blobIds = sql.getColumn(
"SELECT DISTINCT blobId FROM notes WHERE noteId IN (?, ?, ?)",
['test_note5', 'test_note6', 'test_note7']
);
expect(blobIds.length).toBe(1);
const sharedBlobId = blobIds[0];
// Update the blob content directly (simulating what would happen in real update)
sql.execute(
"UPDATE blobs SET content = ? WHERE blobId = ?",
['Updated shared content for all notes', sharedBlobId]
);
// Verify FTS is updated for all three notes
const ftsContents = sql.getColumn(
"SELECT content FROM notes_fts WHERE noteId IN (?, ?, ?) ORDER BY noteId",
['test_note5', 'test_note6', 'test_note7']
);
expect(ftsContents).toHaveLength(3);
ftsContents.forEach(content => {
expect(content).toBe('Updated shared content for all notes');
});
// Search for old content should find nothing
const oldSearch = searchService.searchNotes('Original shared');
expect(oldSearch.filter(r => r.noteId.startsWith('test_'))).toHaveLength(0);
// Search for new content should find all three
const newSearch = searchService.searchNotes('Updated shared');
const foundIds = newSearch.map(r => r.noteId).filter(id => id.startsWith('test_'));
expect(foundIds).toContain('test_note5');
expect(foundIds).toContain('test_note6');
expect(foundIds).toContain('test_note7');
});
it('should handle note switching from shared to unique blob', async () => {
// Create two notes with shared content
const sharedContent = 'Shared content before divergence';
const note1 = await noteService.createNewNote({
noteId: 'test_note8',
parentNoteId: 'root',
title: 'Diverging Note 1',
content: sharedContent,
type: 'text'
});
const note2 = await noteService.createNewNote({
noteId: 'test_note9',
parentNoteId: 'root',
title: 'Diverging Note 2',
content: sharedContent,
type: 'text'
});
// Verify they share the same blob
const initialBlob1 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note8']);
const initialBlob2 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note9']);
expect(initialBlob1).toBe(initialBlob2);
// Change note2 to unique content
await noteService.updateNoteContent('test_note9', 'Unique content after divergence');
// Verify they now have different blobs
const finalBlob1 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note8']);
const finalBlob2 = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note9']);
expect(finalBlob1).not.toBe(finalBlob2);
// Verify FTS is correctly updated
const ftsContent1 = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note8']
);
const ftsContent2 = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note9']
);
expect(ftsContent1).toBe('Shared content before divergence');
expect(ftsContent2).toBe('Unique content after divergence');
// Search should find correct notes
const sharedSearch = searchService.searchNotes('before divergence');
expect(sharedSearch.map(r => r.noteId)).toContain('test_note8');
expect(sharedSearch.map(r => r.noteId)).not.toContain('test_note9');
const uniqueSearch = searchService.searchNotes('after divergence');
expect(uniqueSearch.map(r => r.noteId)).not.toContain('test_note8');
expect(uniqueSearch.map(r => r.noteId)).toContain('test_note9');
});
it('should handle import scenarios where notes exist before blobs', async () => {
// Simulate import scenario: create note without blob first
sql.execute(`
INSERT INTO notes (noteId, title, type, mime, blobId, isDeleted, isProtected, dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES ('test_note10', 'Import Test Note', 'text', 'text/html', 'pending_blob_123', 0, 0, datetime('now'), datetime('now'), datetime('now'), datetime('now'))
`);
// Verify note is not in FTS yet (no blob content)
const initialFts = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId = ?",
['test_note10']
);
expect(initialFts).toBe(0);
// Now create the blob (simulating delayed blob creation during import)
sql.execute(`
INSERT INTO blobs (blobId, content, dateModified, utcDateModified)
VALUES ('pending_blob_123', 'Imported content finally available', datetime('now'), datetime('now'))
`);
// Verify note is now indexed in FTS
const finalFts = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note10']
);
expect(finalFts).toBe('Imported content finally available');
// Search should now find the note
const searchResults = searchService.searchNotes('Imported content');
expect(searchResults.map(r => r.noteId)).toContain('test_note10');
});
it('should correctly handle protected notes during deduplication', async () => {
// Create a regular note
const note1 = await noteService.createNewNote({
noteId: 'test_note11',
parentNoteId: 'root',
title: 'Regular Note',
content: 'Content that will be shared',
type: 'text'
});
// Create a protected note with same content
sql.execute(`
INSERT INTO notes (noteId, title, type, mime, blobId, isDeleted, isProtected, dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES ('test_note12', 'Protected Note', 'text', 'text/html',
(SELECT blobId FROM notes WHERE noteId = 'test_note11'),
0, 1, datetime('now'), datetime('now'), datetime('now'), datetime('now'))
`);
// Verify protected note is NOT in FTS
const protectedInFts = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId = ?",
['test_note12']
);
expect(protectedInFts).toBe(0);
// Verify regular note IS in FTS
const regularInFts = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId = ?",
['test_note11']
);
expect(regularInFts).toBe(1);
// Update blob content
const blobId = sql.getValue("SELECT blobId FROM notes WHERE noteId = ?", ['test_note11']);
sql.execute("UPDATE blobs SET content = ? WHERE blobId = ?", ['Updated shared content', blobId]);
// Verify regular note is updated in FTS
const updatedContent = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note11']
);
expect(updatedContent).toBe('Updated shared content');
// Verify protected note is still NOT in FTS
const protectedStillNotInFts = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId = ?",
['test_note12']
);
expect(protectedStillNotInFts).toBe(0);
});
});
describe('FTS Sync and Cleanup', () => {
it('should sync missing notes to FTS index', async () => {
// Manually create notes without triggering FTS (simulating missed triggers)
sql.execute(`
INSERT INTO notes (noteId, title, type, mime, blobId, isDeleted, isProtected, dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES ('test_note13', 'Missed Note 1', 'text', 'text/html', 'blob_missed_1', 0, 0, datetime('now'), datetime('now'), datetime('now'), datetime('now'))
`);
sql.execute(`
INSERT INTO blobs (blobId, content, dateModified, utcDateModified)
VALUES ('blob_missed_1', 'Content that was missed by triggers', datetime('now'), datetime('now'))
`);
// Delete from FTS to simulate missing index
sql.execute("DELETE FROM notes_fts WHERE noteId = 'test_note13'");
// Verify note is missing from FTS
const beforeSync = sql.getValue(
"SELECT COUNT(*) FROM notes_fts WHERE noteId = ?",
['test_note13']
);
expect(beforeSync).toBe(0);
// Run sync
const syncedCount = ftsSearchService.syncMissingNotes(['test_note13']);
expect(syncedCount).toBe(1);
// Verify note is now in FTS
const afterSync = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
['test_note13']
);
expect(afterSync).toBe('Content that was missed by triggers');
});
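
// For reference, syncMissingNotes() presumably reduces to a statement of this
// shape per requested noteId (a sketch only; the FTS service's actual
// implementation may differ):
const syncSketchSql = `
    INSERT OR REPLACE INTO notes_fts (noteId, title, content)
    SELECT n.noteId, n.title, b.content
    FROM notes n
    JOIN blobs b ON n.blobId = b.blobId
    WHERE n.noteId = ?
      AND n.isDeleted = 0
      AND n.isProtected = 0
`;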
it('should handle FTS rebuild correctly', () => {
// Create some test notes
const noteIds = ['test_note14', 'test_note15', 'test_note16'];
noteIds.forEach((noteId, index) => {
sql.execute(`
INSERT INTO notes (noteId, title, type, mime, blobId, isDeleted, isProtected, dateCreated, dateModified, utcDateCreated, utcDateModified)
VALUES (?, ?, 'text', 'text/html', ?, 0, 0, datetime('now'), datetime('now'), datetime('now'), datetime('now'))
`, [noteId, `Test Note ${index}`, `blob_${noteId}`]);
sql.execute(`
INSERT INTO blobs (blobId, content, dateModified, utcDateModified)
VALUES (?, ?, datetime('now'), datetime('now'))
`, [`blob_${noteId}`, `Content for note ${index}`]);
});
// Corrupt FTS by adding invalid entries
sql.execute("INSERT INTO notes_fts (noteId, title, content) VALUES ('invalid_note', 'Invalid', 'Invalid content')");
// Rebuild index
ftsSearchService.rebuildIndex();
// Verify only valid notes are in FTS
const ftsCount = sql.getValue("SELECT COUNT(*) FROM notes_fts WHERE noteId LIKE 'test_%'");
expect(ftsCount).toBe(3);
// Verify invalid entry is gone
const invalidCount = sql.getValue("SELECT COUNT(*) FROM notes_fts WHERE noteId = 'invalid_note'");
expect(invalidCount).toBe(0);
// Verify content is correct
noteIds.forEach((noteId, index) => {
const content = sql.getValue(
"SELECT content FROM notes_fts WHERE noteId = ?",
[noteId]
);
expect(content).toBe(`Content for note ${index}`);
});
});
});
});

View File

@@ -1,9 +1,9 @@
/**
* FTS5 Search Service
*
* Encapsulates all FTS5-specific operations for full-text searching.
* Provides efficient text search using SQLite's FTS5 extension with:
* - Porter stemming for better matching
* - Trigram tokenization for fast substring matching
* - Snippet extraction for context
* - Highlighting of matched terms
* - Query syntax conversion from Trilium to FTS5
@@ -115,7 +115,7 @@ class FTSSearchService {
/**
* Converts Trilium search syntax to FTS5 MATCH syntax
*
* @param tokens - Array of search tokens
* @param operator - Trilium search operator
* @returns FTS5 MATCH query string
@@ -125,8 +125,18 @@ class FTSSearchService {
throw new Error("No search tokens provided");
}
// Trigram tokenizer requires minimum 3 characters
const shortTokens = tokens.filter(token => token.length < 3);
if (shortTokens.length > 0) {
const shortList = shortTokens.join(', ');
log.info(`Tokens shorter than 3 characters detected (${shortList}) - cannot use trigram FTS5`);
throw new FTSNotAvailableError(
`Trigram tokenizer requires tokens of at least 3 characters. Short tokens: ${shortList}`
);
}
// Sanitize tokens to prevent FTS5 syntax injection
const sanitizedTokens = tokens.map(token =>
this.sanitizeFTS5Token(token)
);
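
A trigram index stores overlapping three-character sequences, so a term shorter than three characters produces no trigrams and can never match; that is what the guard above protects against. A hedged sketch of the query shape this ends up issuing (illustrative only; the service assembles the real MATCH expression internally):

// Illustrative query shape, assumed from this commit's FTS5 setup:
// "hello" and "world" are both >= 3 chars, so trigram matching applies.
for (const row of sql.iterateRows<{ noteId: string }>(
    `SELECT noteId FROM notes_fts WHERE notes_fts MATCH ?`,
    [`"hello" AND "world"`]
)) {
    // ... collect row.noteId
}
// A token such as "ab" yields no trigrams at all, which is why the conversion
// above throws FTSNotAvailableError and lets callers fall back to the
// TypeScript search path.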

View File

@@ -24,10 +24,6 @@ class SearchContext {
fulltextQuery: string;
dbLoadNeeded: boolean;
error: string | null;
/** Determines which backend to use for fulltext search */
searchBackend: "typescript" | "sqlite";
/** Whether SQLite search is enabled (cached from options) */
sqliteSearchEnabled: boolean;
constructor(params: SearchParams = {}) {
this.fastSearch = !!params.fastSearch;
@@ -58,43 +54,6 @@ class SearchContext {
// and some extra data needs to be loaded before executing
this.dbLoadNeeded = false;
this.error = null;
// Determine search backend
this.sqliteSearchEnabled = this.checkSqliteEnabled();
this.searchBackend = this.determineSearchBackend(params);
}
private checkSqliteEnabled(): boolean {
try {
// Import dynamically to avoid circular dependencies
const optionService = require("../options.js").default;
// Default to true if the option doesn't exist
const enabled = optionService.getOptionOrNull("searchSqliteEnabled");
return enabled === null ? true : enabled === "true";
} catch {
return true; // Default to enabled
}
}
private determineSearchBackend(params: SearchParams): "typescript" | "sqlite" {
// Allow override via params for testing
if (params.forceBackend) {
return params.forceBackend;
}
// Check if SQLite is enabled
if (!this.sqliteSearchEnabled) {
return "typescript";
}
try {
const optionService = require("../options.js").default;
const backend = optionService.getOptionOrNull("searchBackend");
// Default to sqlite if option doesn't exist
return backend === "typescript" ? "typescript" : "sqlite";
} catch {
return "sqlite"; // Default to SQLite for better performance
}
}
addError(error: string) {

View File

@@ -13,7 +13,6 @@ import AttributeExistsExp from "../expressions/attribute_exists.js";
import LabelComparisonExp from "../expressions/label_comparison.js";
import NoteFlatTextExp from "../expressions/note_flat_text.js";
import NoteContentFulltextExp from "../expressions/note_content_fulltext.js";
import NoteContentSqliteExp from "../expressions/note_content_sqlite.js";
import OrderByAndLimitExp from "../expressions/order_by_and_limit.js";
import AncestorExp from "../expressions/ancestor.js";
import buildComparator from "./build_comparator.js";
@@ -38,20 +37,15 @@ function getFulltext(_tokens: TokenData[], searchContext: SearchContext, leading
const operator = leadingOperator === "=" ? "=" : "*=*";
if (!searchContext.fastSearch) {
// Choose between SQLite and TypeScript backend
const ContentExp = searchContext.searchBackend === "sqlite"
? NoteContentSqliteExp
: NoteContentFulltextExp;
// For exact match with "=", we need different behavior
if (leadingOperator === "=" && tokens.length === 1) {
// Exact match on title OR exact match on content
return new OrExp([
new PropertyComparisonExp(searchContext, "title", "=", tokens[0]),
new ContentExp("=", { tokens, flatText: false })
new NoteContentFulltextExp("=", { tokens, flatText: false })
]);
}
return new OrExp([new NoteFlatTextExp(tokens), new ContentExp(operator, { tokens, flatText: true })]);
return new OrExp([new NoteFlatTextExp(tokens), new NoteContentFulltextExp(operator, { tokens, flatText: true })]);
} else {
return new NoteFlatTextExp(tokens);
}
@@ -154,12 +148,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
i++;
// Choose between SQLite and TypeScript backend
return new NoteContentFulltextExp(operator.token, { tokens: [tokens[i].token], raw });
const ContentExp = searchContext.searchBackend === "sqlite"
? NoteContentSqliteExp
: NoteContentFulltextExp;
return new ContentExp(operator.token, { tokens: [tokens[i].token], raw });
}
if (tokens[i].token === "parents") {
@@ -222,12 +211,7 @@ function getExpression(tokens: TokenData[], searchContext: SearchContext, level
i += 2;
// Choose between SQLite and TypeScript backend
return new OrExp([new PropertyComparisonExp(searchContext, "title", "*=*", tokens[i].token), new NoteContentFulltextExp("*=*", { tokens: [tokens[i].token] })]);
const ContentExp = searchContext.searchBackend === "sqlite"
? NoteContentSqliteExp
: NoteContentFulltextExp;
return new OrExp([new PropertyComparisonExp(searchContext, "title", "*=*", tokens[i].token), new ContentExp("*=*", { tokens: [tokens[i].token] })]);
}
if (PropertyComparisonExp.isProperty(tokens[i].token)) {
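
With the backend switch gone, every content condition is built through NoteContentFulltextExp directly. A condensed sketch of the resulting construction, using the constructor shapes shown in this file (token value illustrative):

const tokens = ["trigram"];
const fulltext = new OrExp([
    new NoteFlatTextExp(tokens),
    new NoteContentFulltextExp("*=*", { tokens, flatText: true })
]);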

View File

@@ -19,9 +19,6 @@ import sql from "../../sql.js";
import scriptService from "../../script.js";
import striptags from "striptags";
import protectedSessionService from "../../protected_session.js";
import performanceMonitor from "../performance_monitor.js";
import type { DetailedMetrics } from "../performance_monitor.js";
import abTestingService from "../ab_testing.js";
export interface SearchNoteResult {
searchResultNoteIds: string[];
@@ -405,14 +402,6 @@ function parseQueryToExpression(query: string, searchContext: SearchContext) {
function searchNotes(query: string, params: SearchParams = {}): BNote[] {
const searchContext = new SearchContext(params);
// Run A/B test in background (non-blocking)
setImmediate(() => {
abTestingService.runComparison(query, params).catch(err => {
log.info(`A/B test failed: ${err}`);
});
});
const searchResults = findResultsWithQuery(query, searchContext);
return searchResults.map((sr) => becca.notes[sr.noteId]);
@@ -422,49 +411,25 @@ function findResultsWithQuery(query: string, searchContext: SearchContext): Sear
query = query || "";
searchContext.originalQuery = query;
// Start performance monitoring
const totalTimer = performanceMonitor.startTimer();
const phases: { name: string; duration: number }[] = [];
// Parse query
const parseTimer = performanceMonitor.startTimer();
const expression = parseQueryToExpression(query, searchContext);
phases.push({ name: "parse", duration: parseTimer() });
if (!expression) {
return [];
}
// If the query starts with '#', it's a pure expression query.
// Don't use progressive search for these as they may have complex
// ordering or other logic that shouldn't be interfered with.
const isPureExpressionQuery = query.trim().startsWith('#');
let results: SearchResult[];
const searchTimer = performanceMonitor.startTimer();
if (isPureExpressionQuery) {
// For pure expression queries, use standard search without progressive phases
results = performSearch(expression, searchContext, searchContext.enableFuzzyMatching);
} else {
results = findResultsWithExpression(expression, searchContext);
}
phases.push({ name: "search", duration: searchTimer() });
// Record metrics
const metrics: DetailedMetrics = {
query: query.substring(0, 200), // Truncate long queries
backend: searchContext.searchBackend,
totalTime: totalTimer(),
parseTime: phases[0].duration,
searchTime: phases[1].duration,
resultCount: results.length,
phases,
error: searchContext.error || undefined
};
performanceMonitor.recordDetailedMetrics(metrics);
return results;
}
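
findResultsWithQuery() is now a straight parse-then-search pipeline, and the public API is unchanged. A usage sketch (query strings illustrative):

// Plain fulltext query - goes through the progressive search phases:
const matches = searchNotes("trigram indexing");
// Pure expression query (leading '#') - runs as a single standard search:
const tagged = searchNotes("#book");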

View File

@@ -21,6 +21,4 @@ export interface SearchParams {
limit?: number | null;
debug?: boolean;
fuzzyAttributeSearch?: boolean;
/** Force a specific search backend for testing/comparison */
forceBackend?: "typescript" | "sqlite";
}

View File

@@ -5,7 +5,6 @@
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import Database from 'better-sqlite3';
import { SqliteFunctionsService, getSqliteFunctionsService } from './sqlite_functions.js';
import { normalize, stripTags } from '../utils.js';
describe('SqliteFunctionsService', () => {
let db: Database.Database;
@@ -46,38 +45,6 @@ describe('SqliteFunctionsService', () => {
});
});
describe('normalize_text function', () => {
beforeEach(() => {
service.registerFunctions(db);
});
it('should normalize text correctly', () => {
const tests = [
['café', 'cafe'],
['naïve', 'naive'],
['HELLO WORLD', 'hello world'],
['Über', 'uber'],
['', ''],
[null, ''],
];
for (const [input, expected] of tests) {
const result = db.prepare('SELECT normalize_text(?) as result').get(input) as { result: string };
expect(result.result).toBe(expected);
// Verify it matches the utils normalize function
if (input) {
expect(result.result).toBe(normalize(input as string));
}
}
});
it('should handle special characters', () => {
const input = 'Ñoño 123 ABC!@#';
const result = db.prepare('SELECT normalize_text(?) as result').get(input) as any;
expect(result.result).toBe(normalize(input));
});
});
describe('edit_distance function', () => {
beforeEach(() => {
service.registerFunctions(db);
@@ -143,199 +110,4 @@ describe('SqliteFunctionsService', () => {
expect(result.match).toBe(0);
});
});
describe('tokenize_text function', () => {
beforeEach(() => {
service.registerFunctions(db);
});
it('should tokenize text correctly', () => {
const tests = [
['hello world', ['hello', 'world']],
['getUserName', ['getusername', 'get', 'user', 'name']],
['user_name', ['user_name', 'user', 'name']],
['hello-world', ['hello', 'world']],
['test@example.com', ['test', 'example', 'com']],
['', []],
];
for (const [input, expected] of tests) {
const result = db.prepare('SELECT tokenize_text(?) as tokens').get(input) as any;
const tokens = JSON.parse(result.tokens);
// Check that all expected tokens are present (order may vary due to Set)
for (const token of expected) {
expect(tokens).toContain(token);
}
}
});
it('should handle camelCase and snake_case', () => {
const result = db.prepare('SELECT tokenize_text(?) as tokens').get('getUserById_async') as any;
const tokens = JSON.parse(result.tokens);
expect(tokens).toContain('getuserbyid_async');
expect(tokens).toContain('getuserbyid');
expect(tokens).toContain('async');
expect(tokens).toContain('get');
expect(tokens).toContain('user');
expect(tokens).toContain('by');
expect(tokens).toContain('id');
});
it('should handle null input', () => {
const result = db.prepare('SELECT tokenize_text(?) as tokens').get(null) as any;
expect(result.tokens).toBe('[]');
});
});
describe('strip_html function', () => {
beforeEach(() => {
service.registerFunctions(db);
});
it('should strip HTML tags correctly', () => {
const tests = [
['<p>Hello World</p>', 'Hello World'],
['<div><span>Test</span></div>', 'Test'],
['<script>alert("bad")</script>content', 'content'],
['<style>body{color:red}</style>text', 'text'],
['Hello &lt;world&gt;', 'Hello <world>'],
['&nbsp;&nbsp;Space', ' Space'],
['', ''],
];
for (const [input, expected] of tests) {
const result = db.prepare('SELECT strip_html(?) as text').get(input) as any;
expect(result.text).toBe(expected);
}
});
it('should handle complex HTML', () => {
const html = `
<html>
<head><title>Test</title></head>
<body>
<h1>Title</h1>
<p>Paragraph with <strong>bold</strong> text.</p>
<script>console.log("test")</script>
</body>
</html>
`;
const result = db.prepare('SELECT strip_html(?) as text').get(html) as any;
expect(result.text).toContain('Title');
expect(result.text).toContain('Paragraph with bold text');
expect(result.text).not.toContain('console.log');
});
it('should handle null input', () => {
const result = db.prepare('SELECT strip_html(?) as text').get(null) as any;
expect(result.text).toBe('');
});
});
describe('fuzzy_match function', () => {
beforeEach(() => {
service.registerFunctions(db);
});
it('should perform exact matches', () => {
const tests = [
['hello', 'hello world', 1],
['world', 'hello world', 1],
['foo', 'hello world', 0],
];
for (const [needle, haystack, expected] of tests) {
const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(needle, haystack) as any;
expect(result.match).toBe(expected);
}
});
it('should perform fuzzy matches within edit distance', () => {
const tests = [
['helo', 'hello world', 1], // 1 edit distance
['wrld', 'hello world', 1], // 1 edit distance
['hallo', 'hello world', 1], // 1 edit distance
['xyz', 'hello world', 0], // Too different
];
for (const [needle, haystack, expected] of tests) {
const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(needle, haystack) as any;
expect(result.match).toBe(expected);
}
});
it('should handle case insensitive matching', () => {
const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get('HELLO', 'hello world') as any;
expect(result.match).toBe(1);
});
it('should handle null inputs', () => {
const result = db.prepare('SELECT fuzzy_match(?, ?, 2) as match').get(null, 'test') as any;
expect(result.match).toBe(0);
});
});
describe('Integration with SQL queries', () => {
beforeEach(() => {
service.registerFunctions(db);
// Create a test table
db.exec(`
CREATE TABLE test_notes (
id INTEGER PRIMARY KEY,
title TEXT,
content TEXT
)
`);
// Insert test data
const insert = db.prepare('INSERT INTO test_notes (title, content) VALUES (?, ?)');
insert.run('Café Meeting', '<p>Discussion about naïve implementation</p>');
insert.run('über wichtig', 'Very important note with HTML &amp; entities');
insert.run('getUserData', 'Function to get_user_data from database');
});
it('should work in WHERE clauses with normalize_text', () => {
const results = db.prepare(`
SELECT title FROM test_notes
WHERE normalize_text(title) LIKE '%cafe%'
`).all();
expect(results).toHaveLength(1);
expect((results[0] as any).title).toBe('Café Meeting');
});
it('should work with fuzzy matching in queries', () => {
const results = db.prepare(`
SELECT title FROM test_notes
WHERE fuzzy_match('getuserdata', normalize_text(title), 2) = 1
`).all();
expect(results).toHaveLength(1);
expect((results[0] as any).title).toBe('getUserData');
});
it('should work with HTML stripping', () => {
const results = db.prepare(`
SELECT strip_html(content) as clean_content
FROM test_notes
WHERE title = 'Café Meeting'
`).all();
expect((results[0] as any).clean_content).toBe('Discussion about naïve implementation');
});
it('should work with tokenization', () => {
const result = db.prepare(`
SELECT tokenize_text(title) as tokens
FROM test_notes
WHERE title = 'getUserData'
`).get() as any;
const tokens = JSON.parse(result.tokens);
expect(tokens).toContain('get');
expect(tokens).toContain('user');
expect(tokens).toContain('data');
});
});
});
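
Only edit_distance and regex_match remain registered after this cleanup. A small sketch of exercising edit_distance directly, in the same better-sqlite3 style as the spec above (argument order as used elsewhere in this commit: string, string, maxDistance):

const row = db.prepare("SELECT edit_distance(?, ?, ?) AS d")
    .get("hello", "helo", 2) as { d: number };
// row.d === 1, so a fuzzy threshold of 2 would accept the pair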

View File

@@ -1,19 +1,17 @@
/**
* SQLite Custom Functions Service
*
* This service manages custom SQLite functions that enhance search capabilities.
* This service manages custom SQLite functions for general database operations.
* Functions are registered with better-sqlite3 to provide native-speed operations
* directly within SQL queries, enabling efficient search indexing and querying.
* directly within SQL queries.
*
* These functions are used by:
* - Database triggers for automatic search index maintenance
* - Fuzzy search fallback (edit_distance)
* - Direct SQL queries for search operations
* - Regular expression matching (regex_match)
* - Migration scripts for initial data population
*/
import type { Database } from "better-sqlite3";
import log from "../log.js";
import { normalize as utilsNormalize, stripTags } from "../utils.js";
/**
* Configuration for fuzzy search operations * Configuration for fuzzy search operations
@@ -67,15 +65,7 @@ export class SqliteFunctionsService {
// Bind all methods to preserve 'this' context
this.functions = [
{
name: "normalize_text",
name: "edit_distance",
implementation: this.normalizeText.bind(this),
options: {
deterministic: true,
varargs: false
}
},
{
name: "edit_distance",
implementation: this.editDistance.bind(this),
options: {
deterministic: true,
@@ -89,30 +79,6 @@ export class SqliteFunctionsService {
deterministic: true,
varargs: true // Changed to true to handle variable arguments
}
},
{
name: "tokenize_text",
implementation: this.tokenizeText.bind(this),
options: {
deterministic: true,
varargs: false
}
},
{
name: "strip_html",
implementation: this.stripHtml.bind(this),
options: {
deterministic: true,
varargs: false
}
},
{
name: "fuzzy_match",
implementation: this.fuzzyMatch.bind(this),
options: {
deterministic: true,
varargs: true // Changed to true to handle variable arguments
}
} }
];
}
@@ -182,22 +148,6 @@ export class SqliteFunctionsService {
// ===== Function Implementations =====
/**
* Normalize text by removing diacritics and converting to lowercase
* Matches the behavior of utils.normalize() exactly
*
* @param text Text to normalize
* @returns Normalized text
*/
private normalizeText(text: string | null | undefined): string {
if (!text || typeof text !== 'string') {
return '';
}
// Use the exact same normalization as the rest of the codebase
return utilsNormalize(text);
}
/**
* Calculate Levenshtein edit distance between two strings
* Optimized with early termination and single-array approach * Optimized with early termination and single-array approach
@@ -314,186 +264,6 @@ export class SqliteFunctionsService {
return null;
}
}
/**
* Tokenize text into searchable words
* Handles punctuation, camelCase, and snake_case
*
* @param text Text to tokenize
* @returns JSON array string of tokens
*/
private tokenizeText(text: string | null | undefined): string {
if (!text || typeof text !== 'string') {
return '[]';
}
try {
// Use a Set to avoid duplicates from the start
const expandedTokens: Set<string> = new Set();
// Split on word boundaries, preserving apostrophes within words
// But we need to handle underscore separately for snake_case
const tokens = text
.split(/[\s\n\r\t,;.!?()[\]{}"'`~@#$%^&*+=|\\/<>:-]+/)
.filter(token => token.length > 0);
// Process each token
for (const token of tokens) {
// Add the original token in lowercase
expandedTokens.add(token.toLowerCase());
// Handle snake_case first (split on underscore)
const snakeParts = token.split('_').filter(part => part.length > 0);
if (snakeParts.length > 1) {
// We have snake_case
for (const snakePart of snakeParts) {
// Add each snake part
expandedTokens.add(snakePart.toLowerCase());
// Also check for camelCase within each snake part
const camelParts = this.splitCamelCase(snakePart);
for (const camelPart of camelParts) {
if (camelPart.length > 0) {
expandedTokens.add(camelPart.toLowerCase());
}
}
}
} else {
// No snake_case, just check for camelCase
const camelParts = this.splitCamelCase(token);
for (const camelPart of camelParts) {
if (camelPart.length > 0) {
expandedTokens.add(camelPart.toLowerCase());
}
}
}
}
// Convert Set to Array for JSON serialization
const uniqueTokens = Array.from(expandedTokens);
// Return as JSON array string for SQL processing
return JSON.stringify(uniqueTokens);
} catch (error) {
log.error(`Error tokenizing text in SQL: ${error}`);
return '[]';
}
}
/**
* Helper method to split camelCase strings
* @param str String to split
* @returns Array of parts
*/
private splitCamelCase(str: string): string[] {
// Split on transitions from lowercase to uppercase
// Also handle sequences of uppercase letters (e.g., "XMLParser" -> ["XML", "Parser"])
return str.split(/(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])/);
}
/**
* Strip HTML tags from content
* Removes script and style content, then strips tags and decodes entities
*
* @param html HTML content
* @returns Plain text without HTML tags
*/
private stripHtml(html: string | null | undefined): string {
if (!html || typeof html !== 'string') {
return '';
}
try {
let text = html;
// First remove script and style content entirely (including the tags)
// This needs to happen before stripTags to remove the content
text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
// Now use stripTags to remove remaining HTML tags
text = stripTags(text);
// Decode common HTML entities
text = text.replace(/&lt;/g, '<');
text = text.replace(/&gt;/g, '>');
text = text.replace(/&amp;/g, '&');
text = text.replace(/&quot;/g, '"');
text = text.replace(/&#39;/g, "'");
text = text.replace(/&apos;/g, "'");
text = text.replace(/&nbsp;/g, ' ');
// Normalize whitespace - reduce multiple spaces to single space
// But don't trim leading/trailing space if it was from &nbsp;
text = text.replace(/\s+/g, ' ');
return text;
} catch (error) {
log.error(`Error stripping HTML in SQL: ${error}`);
return html; // Return original on error
}
}
/**
* Fuzzy match with configurable edit distance
* Combines exact and fuzzy matching for optimal performance
*
* SQLite will pass 2 or 3 arguments:
* - 2 args: needle, haystack (uses default maxDistance)
* - 3 args: needle, haystack, maxDistance
*
* @returns 1 if match found, 0 otherwise
*/
private fuzzyMatch(...args: any[]): number {
// Handle variable arguments from SQLite
let needle: string | null | undefined = args[0];
let haystack: string | null | undefined = args[1];
let maxDistance: number = args.length > 2 ? args[2] : FUZZY_CONFIG.MAX_EDIT_DISTANCE;
// Validate input types
if (!needle || !haystack) {
return 0;
}
if (typeof needle !== 'string' || typeof haystack !== 'string') {
return 0;
}
// Validate and sanitize maxDistance
if (typeof maxDistance !== 'number' || !Number.isFinite(maxDistance)) {
maxDistance = FUZZY_CONFIG.MAX_EDIT_DISTANCE;
} else {
// Ensure it's a positive integer
maxDistance = Math.max(0, Math.floor(maxDistance));
}
// Normalize for comparison
const normalizedNeedle = needle.toLowerCase();
const normalizedHaystack = haystack.toLowerCase();
// Check exact match first (most common case)
if (normalizedHaystack.includes(normalizedNeedle)) {
return 1;
}
// For fuzzy matching, check individual words
const words = normalizedHaystack.split(/\s+/).filter(w => w.length > 0);
for (const word of words) {
// Skip if word length difference is too large
if (Math.abs(word.length - normalizedNeedle.length) > maxDistance) {
continue;
}
// Check edit distance - call with all 3 args since we're calling internally
const distance = this.editDistance(normalizedNeedle, word, maxDistance);
if (distance <= maxDistance) {
return 1;
}
}
return 0;
}
}
// Export singleton instance getter
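
For completeness, wiring the slimmed-down service into a connection works as the spec above shows; a minimal sketch:

import Database from "better-sqlite3";
import { getSqliteFunctionsService } from "./sqlite_functions.js";

// Register the remaining custom functions (edit_distance, regex_match);
// the service tracks registration state, as isRegistered() in the spec shows.
const db = new Database(":memory:");
getSqliteFunctionsService().registerFunctions(db);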

View File

@@ -1,153 +0,0 @@
/**
* Integration tests for SQLite search implementation
*/
import { describe, it, expect, beforeAll, afterAll } from "vitest";
import sql from "../sql.js";
import { getSQLiteSearchService } from "./sqlite_search_service.js";
import SearchContext from "./search_context.js";
import NoteContentSqliteExp from "./expressions/note_content_sqlite.js";
import NoteSet from "./note_set.js";
import { getSqliteFunctionsService } from "./sqlite_functions.js";
describe("SQLite Search Integration", () => {
let searchService: ReturnType<typeof getSQLiteSearchService>;
let searchContext: SearchContext;
beforeAll(() => {
// Initialize services
searchService = getSQLiteSearchService();
searchContext = new SearchContext({
// searchBackend: "sqlite", // TODO: Add to SearchParams type
// searchSqliteEnabled: true
});
// Register SQL functions
const functionsService = getSqliteFunctionsService();
const db = sql.getDbConnection();
functionsService.registerFunctions(db);
});
afterAll(() => {
// Cleanup if needed
});
describe("Service Initialization", () => {
it("should initialize SQLite search service", () => {
expect(searchService).toBeDefined();
const stats = searchService.getStatistics();
expect(stats).toBeDefined();
expect(stats).toHaveProperty("tablesInitialized");
});
it("should have registered SQL functions", () => {
const functionsService = getSqliteFunctionsService();
expect(functionsService.isRegistered()).toBe(true);
});
});
describe("Expression Creation", () => {
it("should create SQLite expression when available", () => {
const exp = NoteContentSqliteExp.createExpression("*=*", {
tokens: ["test"],
raw: false,
flatText: false
});
expect(exp).toBeDefined();
// Check if it's the SQLite version or fallback
if (NoteContentSqliteExp.isAvailable()) {
expect(exp).toBeInstanceOf(NoteContentSqliteExp);
}
});
it("should handle different operators", () => {
const operators = ["=", "!=", "*=*", "*=", "=*", "%=", "~="];
for (const op of operators) {
const exp = new NoteContentSqliteExp(op, {
tokens: ["test"],
raw: false,
flatText: false
});
expect(exp).toBeDefined();
expect(exp.tokens).toEqual(["test"]);
}
});
});
describe("Search Execution", () => {
it("should execute search with empty input set", () => {
const exp = new NoteContentSqliteExp("*=*", {
tokens: ["test"],
raw: false,
flatText: false
});
const inputSet = new NoteSet();
const resultSet = exp.execute(inputSet, {}, searchContext);
expect(resultSet).toBeDefined();
expect(resultSet).toBeInstanceOf(NoteSet);
});
it("should handle search errors gracefully", () => {
const exp = new NoteContentSqliteExp("invalid_op", {
tokens: ["test"],
raw: false,
flatText: false
});
const inputSet = new NoteSet();
const resultSet = exp.execute(inputSet, {}, searchContext);
expect(resultSet).toBeDefined();
expect(searchContext.hasError()).toBe(true);
});
});
describe("Backend Selection", () => {
it("should use SQLite backend when enabled", () => {
const ctx = new SearchContext({
forceBackend: "sqlite"
});
expect(ctx.searchBackend).toBe("sqlite");
});
it("should use TypeScript backend when forced", () => {
const ctx = new SearchContext({
forceBackend: "typescript"
});
expect(ctx.searchBackend).toBe("typescript");
});
it("should default to SQLite when no preference", () => {
const ctx = new SearchContext({});
// Should default to SQLite for better performance
expect(["sqlite", "typescript"]).toContain(ctx.searchBackend);
});
});
describe("Performance Statistics", () => {
it("should track search statistics", () => {
const initialStats = searchService.getStatistics();
const initialSearches = initialStats.totalSearches || 0;
// Execute a search
searchService.search(
["test"],
"*=*",
searchContext,
{}
);
const newStats = searchService.getStatistics();
expect(newStats.totalSearches).toBeGreaterThan(initialSearches);
expect(newStats.lastSearchTimeMs).toBeGreaterThanOrEqual(0);
});
});
});

View File

@@ -1,320 +0,0 @@
/**
* Tests for SQLite Search Service
*
* These tests verify that the SQLite-based search implementation
* correctly handles all search operators and provides accurate results.
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from "vitest";
import { SQLiteSearchService } from "./sqlite_search_service.js";
import sql from "../sql.js";
import SearchContext from "./search_context.js";
import { initializeSqliteFunctions } from "./sqlite_functions.js";
describe("SQLiteSearchService", () => {
let searchService: SQLiteSearchService;
let searchContext: SearchContext;
beforeAll(() => {
// Initialize SQLite functions for tests
const db = sql.getDbConnection();
if (db) {
initializeSqliteFunctions(db);
}
// Get search service instance
searchService = SQLiteSearchService.getInstance();
// Create test tables if they don't exist
sql.execute(`
CREATE TABLE IF NOT EXISTS note_search_content (
noteId TEXT PRIMARY KEY,
noteContent TEXT,
normalized_content TEXT,
normalized_title TEXT,
isProtected INTEGER DEFAULT 0,
isDeleted INTEGER DEFAULT 0
)
`);
sql.execute(`
CREATE TABLE IF NOT EXISTS note_tokens (
noteId TEXT PRIMARY KEY,
tokens TEXT
)
`);
sql.execute(`
CREATE VIRTUAL TABLE IF NOT EXISTS note_fts USING fts5(
noteId UNINDEXED,
title,
content,
tokenize = 'unicode61'
)
`);
});
beforeEach(() => {
// Clear test data
sql.execute(`DELETE FROM note_search_content`);
sql.execute(`DELETE FROM note_tokens`);
sql.execute(`DELETE FROM note_fts`);
// Create fresh search context
searchContext = new SearchContext();
// Insert test data
insertTestNote("note1", "Hello World", "This is a test note with hello world content.");
insertTestNote("note2", "Programming", "JavaScript and TypeScript programming languages.");
insertTestNote("note3", "Fuzzy Search", "Testing fuzzy matching with similar words like helo and wrold.");
insertTestNote("note4", "Special Characters", "Testing with special@email.com and user_name variables.");
insertTestNote("note5", "CamelCase", "getUserName and setUserEmail functions in JavaScript.");
});
function insertTestNote(noteId: string, title: string, content: string) {
// Insert into search content table
sql.execute(`
INSERT INTO note_search_content (noteId, noteContent, normalized_content, normalized_title, isProtected, isDeleted)
VALUES (?, ?, LOWER(?), LOWER(?), 0, 0)
`, [noteId, content, content, title]);
// Generate tokens
const tokens = tokenize(content + " " + title);
sql.execute(`
INSERT INTO note_tokens (noteId, tokens)
VALUES (?, ?)
`, [noteId, JSON.stringify(tokens)]);
// Insert into FTS5 table
sql.execute(`
INSERT INTO note_fts (noteId, title, content)
VALUES (?, ?, ?)
`, [noteId, title, content]);
}
function tokenize(text: string): string[] {
return text.toLowerCase()
.split(/[\s\n\r\t,;.!?()[\]{}"'`~@#$%^&*+=|\\/<>:_-]+/)
.filter(token => token.length > 0);
}
describe("Substring Search (*=*)", () => {
it("should find notes containing substring", () => {
const results = searchService.search(["hello"], "*=*", searchContext);
expect(results).toContain("note1");
expect(results.size).toBe(1);
});
it("should find notes with multiple tokens", () => {
const results = searchService.search(["java", "script"], "*=*", searchContext);
expect(results).toContain("note2");
expect(results).toContain("note5");
expect(results.size).toBe(2);
});
it("should be case insensitive", () => {
const results = searchService.search(["HELLO"], "*=*", searchContext);
expect(results).toContain("note1");
});
});
describe("Fuzzy Search (~=)", () => {
it("should find notes with fuzzy matching", () => {
const results = searchService.search(["helo"], "~=", searchContext);
expect(results).toContain("note3"); // Contains "helo"
expect(results).toContain("note1"); // Contains "hello" (1 edit distance)
});
it("should respect edit distance threshold", () => {
const results = searchService.search(["xyz"], "~=", searchContext);
expect(results.size).toBe(0); // Too different from any content
});
it("should handle multiple fuzzy tokens", () => {
const results = searchService.search(["fuzzy", "match"], "~=", searchContext);
expect(results).toContain("note3");
});
});
describe("Prefix Search (=*)", () => {
it("should find notes starting with prefix", () => {
const results = searchService.search(["test"], "=*", searchContext);
expect(results).toContain("note3"); // "Testing fuzzy..."
expect(results).toContain("note4"); // "Testing with..."
expect(results.size).toBe(2);
});
it("should handle multiple prefixes", () => {
const results = searchService.search(["java", "type"], "=*", searchContext);
expect(results).toContain("note2"); // Has both "JavaScript" and "TypeScript"
});
});
describe("Suffix Search (*=)", () => {
it("should find notes ending with suffix", () => {
const results = searchService.search(["script"], "*=", searchContext);
expect(results).toContain("note2"); // "JavaScript" and "TypeScript"
expect(results).toContain("note5"); // "JavaScript"
});
it("should handle special suffixes", () => {
const results = searchService.search([".com"], "*=", searchContext);
expect(results).toContain("note4"); // "special@email.com"
});
});
describe("Regex Search (%=)", () => {
it("should find notes matching regex pattern", () => {
const results = searchService.search(["\\w+@\\w+\\.com"], "%=", searchContext);
expect(results).toContain("note4"); // Contains email pattern
});
it("should handle complex patterns", () => {
const results = searchService.search(["get\\w+Name"], "%=", searchContext);
expect(results).toContain("note5"); // "getUserName"
});
it("should handle invalid regex gracefully", () => {
const results = searchService.search(["[invalid"], "%=", searchContext);
expect(results.size).toBe(0); // Should return empty on invalid regex
});
});
describe("Exact Word Search (=)", () => {
it("should find notes with exact word match", () => {
const results = searchService.search(["hello"], "=", searchContext);
expect(results).toContain("note1");
expect(results.size).toBe(1);
});
it("should not match partial words", () => {
const results = searchService.search(["java"], "=", searchContext);
expect(results.size).toBe(0); // "JavaScript" contains "java" but not as whole word
});
it("should find multiple exact words", () => {
const results = searchService.search(["fuzzy", "matching"], "=", searchContext);
expect(results).toContain("note3");
});
});
describe("Not Equals Search (!=)", () => {
it("should find notes not containing exact word", () => {
const results = searchService.search(["hello"], "!=", searchContext);
expect(results).not.toContain("note1");
expect(results.size).toBe(4); // All except note1
});
it("should handle multiple tokens", () => {
const results = searchService.search(["fuzzy", "matching"], "!=", searchContext);
expect(results).not.toContain("note3");
expect(results.size).toBe(4); // All except note3
});
});
describe("Search Options", () => {
it("should respect limit option", () => {
const results = searchService.search(["test"], "*=*", searchContext, { limit: 1 });
expect(results.size).toBeLessThanOrEqual(1);
});
it("should filter by noteId set", () => {
const noteIdFilter = new Set(["note1", "note3"]);
const results = searchService.search(["test"], "*=*", searchContext, { noteIdFilter });
for (const noteId of results) {
expect(noteIdFilter).toContain(noteId);
}
});
it("should exclude deleted notes by default", () => {
// Mark note1 as deleted
sql.execute(`UPDATE note_search_content SET isDeleted = 1 WHERE noteId = 'note1'`);
const results = searchService.search(["hello"], "*=*", searchContext);
expect(results).not.toContain("note1");
});
it("should include deleted notes when specified", () => {
// Mark note1 as deleted
sql.execute(`UPDATE note_search_content SET isDeleted = 1 WHERE noteId = 'note1'`);
const results = searchService.search(["hello"], "*=*", searchContext, { includeDeleted: true });
expect(results).toContain("note1");
});
});
describe("Complex Queries", () => {
it("should combine multiple searches with AND", () => {
const queries = [
{ tokens: ["java"], operator: "*=*" },
{ tokens: ["script"], operator: "*=*" }
];
const results = searchService.searchMultiple(queries, "AND", searchContext);
expect(results).toContain("note2");
expect(results).toContain("note5");
});
it("should combine multiple searches with OR", () => {
const queries = [
{ tokens: ["hello"], operator: "*=*" },
{ tokens: ["fuzzy"], operator: "*=*" }
];
const results = searchService.searchMultiple(queries, "OR", searchContext);
expect(results).toContain("note1");
expect(results).toContain("note3");
expect(results.size).toBe(2);
});
});
describe("Performance", () => {
beforeEach(() => {
// Add more test data for performance testing
for (let i = 10; i < 1000; i++) {
insertTestNote(
`note${i}`,
`Title ${i}`,
`This is note number ${i} with some random content for testing performance.`
);
}
});
it("should handle large result sets efficiently", () => {
const startTime = Date.now();
const results = searchService.search(["note"], "*=*", searchContext);
const elapsed = Date.now() - startTime;
expect(results.size).toBeGreaterThan(100);
expect(elapsed).toBeLessThan(1000); // Should complete within 1 second
});
it("should use limit to restrict results", () => {
const startTime = Date.now();
const results = searchService.search(["note"], "*=*", searchContext, { limit: 10 });
const elapsed = Date.now() - startTime;
expect(results.size).toBeLessThanOrEqual(10);
expect(elapsed).toBeLessThan(100); // Should be very fast with limit
});
});
describe("Statistics", () => {
it("should return correct statistics", () => {
const stats = searchService.getStatistics();
expect(stats.tablesInitialized).toBe(true);
expect(stats.indexedNotes).toBe(5);
expect(stats.totalTokens).toBe(5);
expect(stats.fts5Available).toBe(true);
});
});
afterAll(() => {
// Clean up test data
sql.execute(`DELETE FROM note_search_content`);
sql.execute(`DELETE FROM note_tokens`);
sql.execute(`DELETE FROM note_fts`);
});
});

View File

@@ -1,943 +0,0 @@
/**
* SQLite Search Service
*
* This service provides high-performance search operations using pure SQLite queries.
* It implements all search operators with 100% accuracy and 10-30x performance improvement
* over the TypeScript-based implementation.
*
* Operators supported:
* - *=* (substring): Uses LIKE on normalized content
* - ~= (fuzzy): Uses edit_distance function with tokens
* - =* (prefix): Uses LIKE with prefix pattern
* - *= (suffix): Uses LIKE with suffix pattern
* - %= (regex): Uses regex_match function
* - = (exact word): Uses FTS5 table
* - != (not equals): Inverse of equals
*
* Performance characteristics:
* - Substring search: O(n) with optimized LIKE
* - Fuzzy search: O(n*m) where m is token count
* - Prefix/suffix: O(n) with optimized LIKE
* - Regex: O(n) with native regex support
* - Exact word: O(log n) with FTS5 index
*/
import sql from "../sql.js";
import log from "../log.js";
import type SearchContext from "./search_context.js";
import protectedSessionService from "../protected_session.js";
import { normalize } from "../utils.js";
/**
* Configuration for search operations
*/
const SEARCH_CONFIG = {
MAX_EDIT_DISTANCE: 2,
MIN_TOKEN_LENGTH: 3,
MAX_RESULTS: 10000,
BATCH_SIZE: 1000,
LOG_PERFORMANCE: true,
} as const;
/**
* Interface for search results
*/
export interface SearchResult {
noteId: string;
score?: number;
snippet?: string;
}
/**
* Interface for search options
*/
export interface SearchOptions {
includeProtected?: boolean;
includeDeleted?: boolean;
noteIdFilter?: Set<string>;
limit?: number;
offset?: number;
}
/**
* SQLite-based search service for high-performance note searching
*/
export class SQLiteSearchService {
private static instance: SQLiteSearchService | null = null;
private isInitialized: boolean = false;
private statistics = {
tablesInitialized: false,
totalSearches: 0,
totalTimeMs: 0,
averageTimeMs: 0,
lastSearchTimeMs: 0
};
private constructor() {
this.checkAndInitialize();
}
/**
* Get singleton instance of the search service
*/
static getInstance(): SQLiteSearchService {
if (!SQLiteSearchService.instance) {
SQLiteSearchService.instance = new SQLiteSearchService();
}
return SQLiteSearchService.instance;
}
/**
* Check if search tables are initialized and create them if needed
*/
private checkAndInitialize(): void {
try {
// Check if tables exist
const tableExists = sql.getValue(`
SELECT name FROM sqlite_master
WHERE type='table' AND name='note_search_content'
`);
if (!tableExists) {
log.info("Search tables not found. They will be created by migration.");
this.isInitialized = false;
return;
}
// Verify table structure
const columnCount = sql.getValue<number>(`
SELECT COUNT(*) FROM pragma_table_info('note_search_content')
`) || 0;
if (columnCount > 0) {
this.isInitialized = true;
this.statistics.tablesInitialized = true;
log.info("SQLite search service initialized successfully");
}
} catch (error) {
log.error(`Failed to initialize SQLite search service: ${error}`);
this.isInitialized = false;
this.statistics.tablesInitialized = false;
}
}
/**
* Main search method that delegates to appropriate operator implementation
*/
search(
tokens: string[],
operator: string,
searchContext: SearchContext,
options: SearchOptions = {}
): Set<string> {
if (!this.isInitialized) {
log.info("SQLite search service not initialized, falling back to traditional search");
return new Set();
}
const startTime = Date.now();
let results: Set<string>;
try {
// Normalize tokens for consistent searching
const normalizedTokens = tokens.map(token => normalize(token).toLowerCase());
// Delegate to appropriate search method based on operator
switch (operator) {
case "*=*":
results = this.searchSubstring(normalizedTokens, options);
break;
case "~=":
results = this.searchFuzzy(normalizedTokens, options);
break;
case "=*":
results = this.searchPrefix(normalizedTokens, options);
break;
case "*=":
results = this.searchSuffix(normalizedTokens, options);
break;
case "%=":
results = this.searchRegex(tokens, options); // Use original tokens for regex
break;
case "=":
results = this.searchExactWord(normalizedTokens, options);
break;
case "!=":
results = this.searchNotEquals(normalizedTokens, options);
break;
default:
log.info(`Unsupported search operator: ${operator}`);
return new Set();
}
const elapsed = Date.now() - startTime;
// Update statistics
this.statistics.totalSearches++;
this.statistics.totalTimeMs += elapsed;
this.statistics.lastSearchTimeMs = elapsed;
this.statistics.averageTimeMs = this.statistics.totalTimeMs / this.statistics.totalSearches;
if (SEARCH_CONFIG.LOG_PERFORMANCE) {
log.info(`SQLite search completed: operator=${operator}, tokens=${tokens.join(" ")}, ` +
`results=${results.size}, time=${elapsed}ms`);
}
return results;
} catch (error) {
log.error(`SQLite search failed: ${error}`);
searchContext.addError(`Search failed: ${error}`);
return new Set();
}
}
/**
* Substring search using LIKE on normalized content
* Operator: *=*
*/
private searchSubstring(tokens: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// Build WHERE clause for all tokens
const conditions = tokens.map(() =>
`nsc.full_text_normalized LIKE '%' || ? || '%'`
).join(' AND ');
// Build base query - JOIN with notes table for isDeleted/isProtected filtering
let query = `
SELECT DISTINCT nsc.noteId
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE ${conditions}
`;
const params = [...tokens];
// Add filters using the notes table columns
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Add limit if specified
if (options.limit) {
query += ` LIMIT ${options.limit}`;
}
// Execute query
for (const row of sql.iterateRows<{ noteId: string }>(query, params)) {
// Apply noteId filter if provided
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
results.add(row.noteId);
}
}
return results;
}
/**
* Fuzzy search using edit distance on tokens
* Operator: ~=
*/
private searchFuzzy(tokens: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// For fuzzy search, we need to check tokens individually
// First, get all note IDs that might match
let query = `
SELECT DISTINCT nsc.noteId, nsc.full_text_normalized
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE 1=1
`;
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Process in batches for better performance
const noteData = new Map<string, string>();
for (const row of sql.iterateRows<{ noteId: string, full_text_normalized: string }>(query)) {
if (options.noteIdFilter && !options.noteIdFilter.has(row.noteId)) {
continue;
}
noteData.set(row.noteId, row.full_text_normalized || '');
}
// Get tokens for fuzzy matching
const tokenQuery = `
SELECT DISTINCT noteId, token_normalized
FROM note_tokens
WHERE noteId IN (${Array.from(noteData.keys()).map(() => '?').join(',')})
`;
const noteTokens = new Map<string, Set<string>>();
if (noteData.size > 0) {
for (const row of sql.iterateRows<{ noteId: string, token_normalized: string }>(
tokenQuery, Array.from(noteData.keys())
)) {
if (!noteTokens.has(row.noteId)) {
noteTokens.set(row.noteId, new Set());
}
noteTokens.get(row.noteId)!.add(row.token_normalized);
}
}
// Now check each note for fuzzy matches
for (const [noteId, content] of noteData) {
let allTokensMatch = true;
const noteTokenSet = noteTokens.get(noteId) || new Set();
for (const searchToken of tokens) {
let tokenMatches = false;
// Check if token matches any word in the note
// First check exact match in content
if (content.includes(searchToken)) {
tokenMatches = true;
} else {
// Check fuzzy match against tokens
for (const noteToken of noteTokenSet) {
if (this.fuzzyMatchTokens(searchToken, noteToken)) {
tokenMatches = true;
break;
}
}
}
if (!tokenMatches) {
allTokensMatch = false;
break;
}
}
if (allTokensMatch) {
results.add(noteId);
if (options.limit && results.size >= options.limit) {
break;
}
}
}
return results;
}
/**
* Helper method for fuzzy matching between two tokens
*/
private fuzzyMatchTokens(token1: string, token2: string): boolean {
// Quick exact match check
if (token1 === token2) {
return true;
}
// Don't fuzzy match very short tokens
if (token1.length < SEARCH_CONFIG.MIN_TOKEN_LENGTH ||
token2.length < SEARCH_CONFIG.MIN_TOKEN_LENGTH) {
return false;
}
// Check if length difference is within edit distance threshold
if (Math.abs(token1.length - token2.length) > SEARCH_CONFIG.MAX_EDIT_DISTANCE) {
return false;
}
// Use SQL function for edit distance calculation
const distance = sql.getValue<number>(`
SELECT edit_distance(?, ?, ?)
`, [token1, token2, SEARCH_CONFIG.MAX_EDIT_DISTANCE]);
return distance <= SEARCH_CONFIG.MAX_EDIT_DISTANCE;
}
/**
* Prefix search using LIKE with prefix pattern
* Operator: =*
*/
private searchPrefix(tokens: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// Build WHERE clause for all tokens
const conditions = tokens.map(() =>
`nsc.full_text_normalized LIKE ? || '%'`
).join(' AND ');
// Build query - JOIN with notes table for isDeleted/isProtected filtering
let query = `
SELECT DISTINCT nsc.noteId
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE ${conditions}
`;
const params = [...tokens];
// Add filters using the notes table columns
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Add limit if specified
if (options.limit) {
query += ` LIMIT ${options.limit}`;
}
// Execute query
for (const row of sql.iterateRows<{ noteId: string }>(query, params)) {
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
results.add(row.noteId);
}
}
return results;
}
/**
* Suffix search using LIKE with suffix pattern
* Operator: *=
*/
private searchSuffix(tokens: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// Build WHERE clause for all tokens
const conditions = tokens.map(() =>
`nsc.full_text_normalized LIKE '%' || ?`
).join(' AND ');
// Build query - JOIN with notes table for isDeleted/isProtected filtering
let query = `
SELECT DISTINCT nsc.noteId
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE ${conditions}
`;
const params = [...tokens];
// Add filters using the notes table columns
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Add limit if specified
if (options.limit) {
query += ` LIMIT ${options.limit}`;
}
// Execute query
for (const row of sql.iterateRows<{ noteId: string }>(query, params)) {
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
results.add(row.noteId);
}
}
return results;
}
/**
* Regex search using regex_match function
* Operator: %=
*/
private searchRegex(patterns: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// For regex, we use the combined title+content (not normalized)
// Build WHERE clause for all patterns
const conditions = patterns.map(() =>
`regex_match(nsc.title || ' ' || nsc.content, ?, 'ims') = 1`
).join(' AND ');
// Build query - JOIN with notes table for isDeleted/isProtected filtering
let query = `
SELECT DISTINCT nsc.noteId
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE ${conditions}
`;
const params = [...patterns];
// Add filters using the notes table columns
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Add limit if specified
if (options.limit) {
query += ` LIMIT ${options.limit}`;
}
// Execute query
try {
for (const row of sql.iterateRows<{ noteId: string }>(query, params)) {
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
results.add(row.noteId);
}
}
} catch (error) {
log.error(`Regex search failed: ${error}`);
// Return empty set on regex error
}
return results;
}
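// For the %= operator a pattern list like ['trilium.*sync'] becomes one
// regex_match(...) = 1 condition per pattern, evaluated with flags 'ims'
// (case-insensitive, multiline, dot-matches-newline) against the raw,
// un-normalized title + ' ' + content of each candidate note.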
/**
* Exact word search using FTS5 or token matching
* Operator: =
*/
private searchExactWord(tokens: string[], options: SearchOptions): Set<string> {
const results = new Set<string>();
// Try FTS5 first if available
const fts5Available = this.checkFTS5Availability();
if (fts5Available) {
try {
// Build FTS5 query - quote each token as a phrase, doubling any
// embedded double quotes per FTS5 string-literal rules
const ftsQuery = tokens.map(t => `"${t.replace(/"/g, '""')}"`).join(' ');
// FTS5 doesn't have isDeleted or isProtected columns,
// so we need to join with notes table for filtering
let query = `
SELECT DISTINCT f.noteId
FROM notes_fts f
JOIN notes n ON f.noteId = n.noteId
WHERE f.notes_fts MATCH ?
`;
const params = [ftsQuery];
// Add filters using the notes table columns
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Add limit if specified
if (options.limit) {
query += ` LIMIT ${options.limit}`;
}
for (const row of sql.iterateRows<{ noteId: string }>(query, params)) {
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
results.add(row.noteId);
}
}
return results;
} catch (error) {
log.info(`FTS5 search failed, falling back to token search: ${error}`);
}
}
// Fallback to token-based exact match
// Build query to check if all tokens exist as whole words
let query = `
SELECT DISTINCT nt.noteId, nt.token_normalized
FROM note_tokens nt
JOIN notes n ON nt.noteId = n.noteId
WHERE 1=1
`;
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
// Get all matching notes and their tokens
const candidateNotes = new Map<string, Set<string>>();
for (const row of sql.iterateRows<{ noteId: string, token_normalized: string }>(query)) {
if (options.noteIdFilter && !options.noteIdFilter.has(row.noteId)) {
continue;
}
if (!candidateNotes.has(row.noteId)) {
candidateNotes.set(row.noteId, new Set());
}
candidateNotes.get(row.noteId)!.add(row.token_normalized);
}
// Check each candidate for exact token matches
for (const [noteId, noteTokenSet] of candidateNotes) {
const allTokensFound = tokens.every(token => noteTokenSet.has(token));
if (allTokensFound) {
results.add(noteId);
if (options.limit && results.size >= options.limit) {
break;
}
}
}
return results;
}
/**
* Not equals search - inverse of exact word search
* Operator: !=
*/
private searchNotEquals(tokens: string[], options: SearchOptions): Set<string> {
// Get all notes that DON'T match the exact word search.
// Drop any limit for this inner query - truncating the matching set
// would wrongly surface matching notes in the inverse result.
const matchingNotes = this.searchExactWord(tokens, { ...options, limit: undefined });
// Get all notes - JOIN with notes table for isDeleted/isProtected filtering
let query = `
SELECT DISTINCT nsc.noteId
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE 1=1
`;
if (!options.includeDeleted) {
query += ` AND n.isDeleted = 0`;
}
if (!options.includeProtected && !protectedSessionService.isProtectedSessionAvailable()) {
query += ` AND n.isProtected = 0`;
}
const allNotes = new Set<string>();
for (const row of sql.iterateRows<{ noteId: string }>(query)) {
if (!options.noteIdFilter || options.noteIdFilter.has(row.noteId)) {
allNotes.add(row.noteId);
}
}
// Return the difference
const results = new Set<string>();
for (const noteId of allNotes) {
if (!matchingNotes.has(noteId)) {
results.add(noteId);
if (options.limit && results.size >= options.limit) {
break;
}
}
}
return results;
}
/**
* Check if FTS5 is available
*/
private checkFTS5Availability(): boolean {
try {
const result = sql.getValue(`
SELECT name FROM sqlite_master
WHERE type='table' AND name='notes_fts'
`);
return !!result;
} catch {
return false;
}
}
/**
* Search with multiple operators (for complex queries)
*/
searchMultiple(
queries: Array<{ tokens: string[], operator: string }>,
combineMode: 'AND' | 'OR',
searchContext: SearchContext,
options: SearchOptions = {}
): Set<string> {
if (queries.length === 0) {
return new Set();
}
const resultSets = queries.map(q =>
this.search(q.tokens, q.operator, searchContext, options)
);
if (combineMode === 'AND') {
// Intersection of all result sets
return resultSets.reduce((acc, set) => {
const intersection = new Set<string>();
for (const item of acc) {
if (set.has(item)) {
intersection.add(item);
}
}
return intersection;
});
} else {
// Union of all result sets
return resultSets.reduce((acc, set) => {
for (const item of set) {
acc.add(item);
}
return acc;
}, new Set<string>());
}
}
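// Hypothetical call site combining two operators with AND semantics
// (a sketch; searchContext is built as in the verification script below):
//
//   const noteIds = service.searchMultiple(
//       [{ tokens: ['sqlite'], operator: '=' },    // exact word
//        { tokens: ['index'],  operator: '=*' }],  // prefix
//       'AND', searchContext);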
/**
* Get search statistics for monitoring
*/
getStatistics() {
// Return the in-memory statistics object which includes performance data
return {
...this.statistics,
indexedNotes: this.isInitialized ? this.getIndexedNotesCount() : 0,
totalTokens: this.isInitialized ? this.getTotalTokensCount() : 0,
fts5Available: this.isInitialized ? this.checkFTS5Availability() : false
};
}
/**
* Get count of indexed notes
*/
private getIndexedNotesCount(): number {
try {
return sql.getValue<number>(`
SELECT COUNT(DISTINCT nsc.noteId)
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE n.isDeleted = 0
`) || 0;
} catch {
return 0;
}
}
/**
* Get total tokens count
*/
private getTotalTokensCount(): number {
try {
return sql.getValue<number>(`
SELECT COUNT(*) FROM note_tokens
`) || 0;
} catch {
return 0;
}
}
/**
* Rebuild search index for a specific note
*/
rebuildNoteIndex(noteId: string): void {
if (!this.isInitialized) {
log.info("Cannot rebuild index - search tables not initialized");
return;
}
try {
// This will be handled by triggers automatically
// But we can force an update by touching the note
sql.execute(`
UPDATE notes
-- SQLite's %f already yields seconds with milliseconds (SS.SSS),
-- so '%S.%f' would duplicate the seconds
SET dateModified = strftime('%Y-%m-%d %H:%M:%f', 'now')
WHERE noteId = ?
`, [noteId]);
log.info(`Rebuilt search index for note ${noteId}`);
} catch (error) {
log.error(`Failed to rebuild index for note ${noteId}: ${error}`);
}
}
/**
* Clear search index (for testing/maintenance)
*/
clearIndex(): void {
if (!this.isInitialized) {
return;
}
try {
sql.execute(`DELETE FROM note_search_content`);
sql.execute(`DELETE FROM note_tokens`);
if (this.checkFTS5Availability()) {
sql.execute(`DELETE FROM notes_fts`);
}
log.info("Search index cleared");
} catch (error) {
log.error(`Failed to clear search index: ${error}`);
}
}
/**
* Get detailed index status information
*/
async getIndexStatus(): Promise<{
initialized: boolean;
tablesExist: boolean;
indexedNotes: number;
totalNotes: number;
totalTokens: number;
fts5Available: boolean;
lastRebuild?: string;
coverage: number;
}> {
const tablesExist = this.isInitialized;
if (!tablesExist) {
return {
initialized: false,
tablesExist: false,
indexedNotes: 0,
totalNotes: 0,
totalTokens: 0,
fts5Available: false,
coverage: 0
};
}
// Get total indexable notes
const totalNotes = sql.getValue<number>(`
SELECT COUNT(*)
FROM notes
WHERE type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND isDeleted = 0
AND isProtected = 0
`) || 0;
// Get indexed notes count
const indexedNotes = sql.getValue<number>(`
SELECT COUNT(DISTINCT nsc.noteId)
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE n.isDeleted = 0
`) || 0;
// Get token count
const totalTokens = sql.getValue<number>(`
SELECT COUNT(*) FROM note_tokens
`) || 0;
// Calculate coverage percentage
const coverage = totalNotes > 0 ? (indexedNotes / totalNotes) * 100 : 0;
return {
initialized: true,
tablesExist: true,
indexedNotes,
totalNotes,
totalTokens,
fts5Available: this.checkFTS5Availability(),
coverage: Math.round(coverage * 100) / 100
};
}
/**
* Rebuild the entire search index
*/
async rebuildIndex(force: boolean = false): Promise<void> {
if (!this.isInitialized && !force) {
throw new Error("Search tables not initialized. Use force=true to create tables.");
}
log.info("Starting search index rebuild...");
const startTime = Date.now();
try {
// Clear existing index
this.clearIndex();
// Rebuild from all notes
const batchSize = 100;
let offset = 0;
let totalProcessed = 0;
while (true) {
const notes = sql.getRows<{
noteId: string;
title: string;
type: string;
mime: string;
content: string | null;
}>(`
SELECT
n.noteId,
n.title,
n.type,
n.mime,
b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.isDeleted = 0
AND n.isProtected = 0
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
ORDER BY n.noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
if (notes.length === 0) {
break;
}
// Process batch - trigger will handle the actual indexing
for (const note of notes) {
try {
// Touch the note to trigger re-indexing
sql.execute(`
UPDATE notes
SET dateModified = strftime('%Y-%m-%d %H:%M:%f', 'now')
WHERE noteId = ?
`, [note.noteId]);
totalProcessed++;
} catch (error) {
log.error(`Failed to reindex note ${note.noteId}: ${error}`);
}
}
offset += batchSize;
if (totalProcessed % 1000 === 0) {
log.info(`Reindexed ${totalProcessed} notes...`);
}
}
const duration = Date.now() - startTime;
log.info(`Index rebuild completed: ${totalProcessed} notes in ${duration}ms`);
} catch (error) {
log.error(`Index rebuild failed: ${error}`);
throw error;
}
}
}
// Export singleton instance getter
export function getSQLiteSearchService(): SQLiteSearchService {
return SQLiteSearchService.getInstance();
}
// Export default getter function (not the instance, to avoid initialization issues)
export default getSQLiteSearchService;
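// Minimal usage sketch for this module (assumes a SearchContext like the
// one constructed in the verification script further down in this commit):
//
//   import { getSQLiteSearchService } from "./sqlite_search_service.js";
//   import SearchContext from "./search_context.js";
//
//   const service = getSQLiteSearchService();
//   const ctx = new SearchContext({ fastSearch: false });
//   const ids = service.search(['trilium'], '*=*', ctx, { limit: 50 });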

View File

@@ -1,471 +0,0 @@
/**
* SQLite Search Utilities
*
* Helper functions and utilities for SQLite-based search operations.
* These utilities provide common functionality needed by the search service
* and help with data preparation, validation, and performance monitoring.
*/
import sql from "../sql.js";
import log from "../log.js";
import { normalize, stripTags } from "../utils.js";
/**
* Configuration for search utilities
*/
export const SEARCH_UTILS_CONFIG = {
BATCH_SIZE: 1000,
MAX_CONTENT_SIZE: 2 * 1024 * 1024, // 2MB
MIN_TOKEN_LENGTH: 2,
MAX_TOKEN_LENGTH: 100,
LOG_SLOW_QUERIES: true,
SLOW_QUERY_THRESHOLD: 100, // ms
} as const;
/**
* Interface for note content data
*/
export interface NoteContentData {
noteId: string;
title: string;
content: string | null; // blobs LEFT JOIN can yield NULL content
type: string;
mime: string;
isProtected: boolean;
isDeleted: boolean;
}
/**
* Normalize text for search indexing
* Ensures consistent normalization across all search operations
*/
export function normalizeForSearch(text: string | null | undefined): string {
if (!text || typeof text !== 'string') {
return '';
}
// Use the standard normalize function and convert to lowercase
return normalize(text).toLowerCase();
}
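// Examples (assuming normalize() folds diacritics, as Trilium's utils do):
//   normalizeForSearch('Café NOTES') -> 'cafe notes'
//   normalizeForSearch(null)         -> ''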
/**
* Tokenize text into searchable words
* Handles camelCase, snake_case, and special characters
*/
export function tokenizeText(text: string | null | undefined): string[] {
if (!text || typeof text !== 'string') {
return [];
}
const tokens = new Set<string>();
// Split on word boundaries
const words = text
.split(/[\s\n\r\t,;.!?()[\]{}"'`~@#$%^&*+=|\\/<>:-]+/)
.filter(word => word.length >= SEARCH_UTILS_CONFIG.MIN_TOKEN_LENGTH &&
word.length <= SEARCH_UTILS_CONFIG.MAX_TOKEN_LENGTH);
for (const word of words) {
// Add the original word (lowercase)
tokens.add(word.toLowerCase());
// Handle snake_case
const snakeParts = word.split('_').filter(part => part.length > 0);
if (snakeParts.length > 1) {
for (const part of snakeParts) {
tokens.add(part.toLowerCase());
// Also handle camelCase within snake_case parts
const camelParts = splitCamelCase(part);
for (const camelPart of camelParts) {
if (camelPart.length >= SEARCH_UTILS_CONFIG.MIN_TOKEN_LENGTH) {
tokens.add(camelPart.toLowerCase());
}
}
}
} else {
// Handle camelCase
const camelParts = splitCamelCase(word);
for (const part of camelParts) {
if (part.length >= SEARCH_UTILS_CONFIG.MIN_TOKEN_LENGTH) {
tokens.add(part.toLowerCase());
}
}
}
}
return Array.from(tokens);
}
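// Worked example - tokenizeText('getUserName_v2') yields, in Set
// insertion order:
//   ['getusername_v2', 'getusername', 'get', 'user', 'name', 'v2']
// (the underscore is not a split character in the regex above, so the
// whole word survives alongside its snake_case and camelCase parts)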
/**
* Split camelCase strings into parts
*/
function splitCamelCase(str: string): string[] {
// Split on transitions from lowercase to uppercase
// Also handle sequences of uppercase letters (e.g., "XMLParser" -> ["XML", "Parser"])
return str.split(/(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])/);
}
/**
* Process HTML content for indexing
* Removes tags and normalizes the text
*/
export function processHtmlContent(html: string | null | undefined): string {
if (!html || typeof html !== 'string') {
return '';
}
// Remove script and style content
let text = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
// Strip remaining tags
text = stripTags(text);
// Decode HTML entities
text = text.replace(/&nbsp;/g, ' ');
text = text.replace(/&lt;/g, '<');
text = text.replace(/&gt;/g, '>');
text = text.replace(/&amp;/g, '&');
text = text.replace(/&quot;/g, '"');
text = text.replace(/&#39;/g, "'");
text = text.replace(/&apos;/g, "'");
// Normalize whitespace
text = text.replace(/\s+/g, ' ').trim();
return text;
}
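// Example: processHtmlContent('<p>a &amp; b <script>x()</script></p>')
// returns 'a & b' - script/style bodies are dropped, the entities listed
// above are decoded, and whitespace is collapsed.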
/**
* Process JSON content (e.g., mindmaps, canvas) for indexing
*/
export function processJsonContent(json: string | null | undefined, type: string): string {
if (!json || typeof json !== 'string') {
return '';
}
try {
const data = JSON.parse(json);
if (type === 'mindMap') {
return extractMindMapText(data);
} else if (type === 'canvas') {
return extractCanvasText(data);
}
// For other JSON types, try to extract text content
return extractTextFromObject(data);
} catch (error) {
log.info(`Failed to process JSON content: ${error}`);
return '';
}
}
/**
* Extract text from mindmap JSON structure
*/
function extractMindMapText(data: any): string {
const texts: string[] = [];
function collectTopics(node: any): void {
if (!node) return;
if (node.topic) {
texts.push(node.topic);
}
if (node.children && Array.isArray(node.children)) {
for (const child of node.children) {
collectTopics(child);
}
}
}
if (data.nodedata) {
collectTopics(data.nodedata);
}
return texts.join(' ');
}
/**
* Extract text from canvas JSON structure
*/
function extractCanvasText(data: any): string {
const texts: string[] = [];
if (data.elements && Array.isArray(data.elements)) {
for (const element of data.elements) {
if (element.type === 'text' && element.text) {
texts.push(element.text);
}
}
}
return texts.join(' ');
}
/**
* Generic text extraction from JSON objects
*/
function extractTextFromObject(obj: any, maxDepth = 10): string {
if (maxDepth <= 0) return '';
const texts: string[] = [];
if (typeof obj === 'string') {
return obj;
} else if (Array.isArray(obj)) {
for (const item of obj) {
const text = extractTextFromObject(item, maxDepth - 1);
if (text) texts.push(text);
}
} else if (typeof obj === 'object' && obj !== null) {
for (const key of Object.keys(obj)) {
// Look for common text field names
if (['text', 'content', 'value', 'title', 'name', 'label', 'description'].includes(key.toLowerCase())) {
const value = obj[key];
if (typeof value === 'string') {
texts.push(value);
}
} else {
const text = extractTextFromObject(obj[key], maxDepth - 1);
if (text) texts.push(text);
}
}
}
return texts.join(' ');
}
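// Example: extractTextFromObject({ title: 'Plan', meta: { label: 'Q3' } })
// returns 'Plan Q3' - recognized text fields are collected depth-first.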
/**
* Prepare note content for indexing
* Handles different note types and formats
*/
export function prepareNoteContent(note: NoteContentData): {
normalizedContent: string;
normalizedTitle: string;
tokens: string[];
} {
// Content may be NULL when the blobs LEFT JOIN finds nothing
let content = note.content || '';
// Process content based on type
if (note.type === 'text' && note.mime === 'text/html') {
content = processHtmlContent(content);
} else if ((note.type === 'mindMap' || note.type === 'canvas') && note.mime === 'application/json') {
content = processJsonContent(content, note.type);
}
// Check content size
if (content.length > SEARCH_UTILS_CONFIG.MAX_CONTENT_SIZE) {
log.info(`Note ${note.noteId} content exceeds max size (${content.length} chars), truncating`);
content = content.substring(0, SEARCH_UTILS_CONFIG.MAX_CONTENT_SIZE);
}
// Normalize content and title
const normalizedContent = normalizeForSearch(content);
const normalizedTitle = normalizeForSearch(note.title);
// Generate tokens from both content and title
const allText = `${note.title} ${content}`;
const tokens = tokenizeText(allText);
return {
normalizedContent,
normalizedTitle,
tokens
};
}
/**
* Update search index for a single note
*/
export async function updateNoteSearchIndex(noteId: string): Promise<void> {
try {
// Get note data
const noteData = sql.getRow<NoteContentData>(`
SELECT n.noteId, n.title, b.content, n.type, n.mime, n.isProtected, n.isDeleted
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.noteId = ?
`, [noteId]);
if (!noteData) {
log.info(`Note ${noteId} not found for indexing`);
return;
}
// Prepare content for indexing
const { normalizedContent, normalizedTitle, tokens } = prepareNoteContent(noteData);
// Update search content table
// Note: note_search_content doesn't have isProtected/isDeleted columns
// Those are in the notes table which we join with
sql.execute(`
INSERT OR REPLACE INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
VALUES (?, ?, ?, ?, ?, ?)
`, [noteId, noteData.title, noteData.content || '',
normalizedTitle, normalizedContent,
normalizedTitle + ' ' + normalizedContent]);
// Delete existing tokens for this note
sql.execute(`DELETE FROM note_tokens WHERE noteId = ?`, [noteId]);
// Insert new tokens with proper structure
let position = 0;
for (const token of tokens) {
sql.execute(`
INSERT INTO note_tokens (noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [noteId, token, normalizeForSearch(token), position]);
position++;
}
log.info(`Updated search index for note ${noteId}`);
} catch (error) {
log.error(`Failed to update search index for note ${noteId}: ${error}`);
throw error;
}
}
/**
* Batch update search index for multiple notes
*/
export async function batchUpdateSearchIndex(noteIds: string[]): Promise<void> {
const startTime = Date.now();
let successCount = 0;
let errorCount = 0;
// Process in batches
for (let i = 0; i < noteIds.length; i += SEARCH_UTILS_CONFIG.BATCH_SIZE) {
const batch = noteIds.slice(i, i + SEARCH_UTILS_CONFIG.BATCH_SIZE);
try {
sql.transactional(() => {
for (const noteId of batch) {
try {
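// updateNoteSearchIndex is declared async but does only synchronous
// SQL work, so calling it without await inside this synchronous
// transaction works; only synchronously thrown errors reach the
// catch below.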
updateNoteSearchIndex(noteId);
successCount++;
} catch (error) {
log.error(`Failed to index note ${noteId}: ${error}`);
errorCount++;
}
}
});
} catch (error) {
log.error(`Batch indexing failed: ${error}`);
errorCount += batch.length;
}
}
const elapsed = Date.now() - startTime;
log.info(`Batch search indexing completed: ${successCount} success, ${errorCount} errors, ${elapsed}ms`);
}
/**
* Verify search index integrity
*/
export function verifySearchIndex(): {
valid: boolean;
issues: string[];
stats: {
totalNotes: number;
indexedNotes: number;
missingFromIndex: number;
orphanedEntries: number;
};
} {
const issues: string[] = [];
// Count total notes
const totalNotes = sql.getValue<number>(`
SELECT COUNT(*) FROM notes WHERE isDeleted = 0
`) || 0;
// Count indexed notes - JOIN with notes table for isDeleted filter
const indexedNotes = sql.getValue<number>(`
SELECT COUNT(DISTINCT nsc.noteId)
FROM note_search_content nsc
JOIN notes n ON nsc.noteId = n.noteId
WHERE n.isDeleted = 0
`) || 0;
// Find notes missing from index
const missingNotes = sql.getColumn<string>(`
SELECT noteId FROM notes
WHERE isDeleted = 0
AND noteId NOT IN (SELECT noteId FROM note_search_content)
`);
if (missingNotes.length > 0) {
issues.push(`${missingNotes.length} notes missing from search index`);
}
// Find orphaned index entries
const orphanedEntries = sql.getColumn<string>(`
SELECT noteId FROM note_search_content
WHERE noteId NOT IN (SELECT noteId FROM notes)
`);
if (orphanedEntries.length > 0) {
issues.push(`${orphanedEntries.length} orphaned entries in search index`);
}
// Check token table consistency
const tokenMismatch = sql.getValue<number>(`
SELECT COUNT(*) FROM note_search_content
WHERE noteId NOT IN (SELECT noteId FROM note_tokens)
`) || 0;
if (tokenMismatch > 0) {
issues.push(`${tokenMismatch} notes missing from token index`);
}
return {
valid: issues.length === 0,
issues,
stats: {
totalNotes,
indexedNotes,
missingFromIndex: missingNotes.length,
orphanedEntries: orphanedEntries.length
}
};
}
/**
* Performance monitoring wrapper for search queries
*/
export function monitorQuery<T>(
queryName: string,
queryFn: () => T
): T {
const startTime = Date.now();
try {
const result = queryFn();
const elapsed = Date.now() - startTime;
if (SEARCH_UTILS_CONFIG.LOG_SLOW_QUERIES && elapsed > SEARCH_UTILS_CONFIG.SLOW_QUERY_THRESHOLD) {
log.info(`Slow search query detected: ${queryName} took ${elapsed}ms`);
}
return result;
} catch (error) {
const elapsed = Date.now() - startTime;
log.error(`Search query failed: ${queryName} after ${elapsed}ms - ${error}`);
throw error;
}
}
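/**
 * Example wrapping a raw lookup in the monitor (hypothetical call site):
 *
 *   const tokenCount = monitorQuery('countTokens', () =>
 *       sql.getValue<number>(`SELECT COUNT(*) FROM note_tokens`) || 0);
 */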
/**
* Export utility functions for testing
*/
export const testUtils = {
splitCamelCase,
extractMindMapText,
extractCanvasText,
extractTextFromObject
};

View File

@@ -1,219 +0,0 @@
#!/usr/bin/env ts-node
/**
* Verification script for SQLite search implementation
*
* This script checks:
* 1. If migration 0235 has run (tables exist)
* 2. If SQL functions are registered
* 3. If search queries work correctly
* 4. Performance comparison between SQLite and TypeScript
*/
import sql from "../sql.js";
import log from "../log.js";
import { getSQLiteSearchService } from "./sqlite_search_service.js";
import SearchContext from "./search_context.js";
import becca from "../../becca/becca.js";
async function verifyTables(): Promise<boolean> {
console.log("\n=== Checking Database Tables ===");
const tables = [
{ name: 'note_search_content', required: true },
{ name: 'note_tokens', required: true },
{ name: 'notes_fts', required: false } // From migration 0234
];
let allExist = true;
for (const table of tables) {
const exists = (sql.getValue<number>(`
SELECT COUNT(*) FROM sqlite_master
WHERE type='table' AND name=?
`, [table.name]) || 0) > 0;
const status = exists ? '✓' : '✗';
const requiredText = table.required ? ' (REQUIRED)' : ' (optional)';
console.log(` ${status} ${table.name}${requiredText}`);
if (table.required && !exists) {
allExist = false;
}
}
if (!allExist) {
console.log("\n❌ Required tables are missing!");
console.log(" Migration 0235 needs to run.");
console.log(" The APP_DB_VERSION has been updated to 235.");
console.log(" Restart the server to run the migration.");
}
return allExist;
}
async function verifyFunctions(): Promise<boolean> {
console.log("\n=== Checking SQL Functions ===");
const functions = [
{ name: 'normalize_text', test: "SELECT normalize_text('Café')" },
{ name: 'edit_distance', test: "SELECT edit_distance('test', 'text', 2)" },
{ name: 'regex_match', test: "SELECT regex_match('testing', 'test')" }, // args: (text, pattern), matching searchRegex usage
{ name: 'tokenize_text', test: "SELECT tokenize_text('hello world')" },
{ name: 'strip_html', test: "SELECT strip_html('<p>test</p>')" }
];
let allWork = true;
for (const func of functions) {
try {
const result = sql.getValue(func.test);
console.log(`${func.name} - Result: ${result}`);
} catch (error: any) {
console.log(`${func.name} - Error: ${error.message}`);
allWork = false;
}
}
if (!allWork) {
console.log("\n⚠ Some SQL functions are not working.");
console.log(" They should be registered when the server starts.");
}
return allWork;
}
async function verifySearchContent(): Promise<void> {
console.log("\n=== Checking Search Index Content ===");
const noteCount = sql.getValue<number>(`
SELECT COUNT(*) FROM notes
WHERE isDeleted = 0 AND isProtected = 0
`) || 0;
const indexedCount = sql.getValue<number>(`
SELECT COUNT(*) FROM note_search_content
`) || 0;
const tokenCount = sql.getValue<number>(`
SELECT COUNT(DISTINCT noteId) FROM note_tokens
`) || 0;
console.log(` Notes eligible for indexing: ${noteCount}`);
console.log(` Notes in search index: ${indexedCount}`);
console.log(` Notes with tokens: ${tokenCount}`);
if (indexedCount === 0 && noteCount > 0) {
console.log("\n⚠ Search index is empty but there are notes to index.");
console.log(" The migration should populate the index automatically.");
} else if (indexedCount < noteCount) {
console.log("\n⚠ Some notes are not indexed.");
console.log(` Missing: ${noteCount - indexedCount} notes`);
} else {
console.log("\n✓ Search index is populated");
}
}
async function testSearch(): Promise<void> {
console.log("\n=== Testing Search Functionality ===");
// Initialize becca if needed
if (!becca.loaded) {
console.log(" Loading becca...");
// Note: becca may not have a load method in this version
}
const searchService = getSQLiteSearchService();
const searchContext = new SearchContext({
fastSearch: false,
includeArchivedNotes: false,
fuzzyAttributeSearch: false,
debug: false
});
// Test different operators
const tests = [
{ operator: '*=*', tokens: ['note'], description: 'Substring search' },
{ operator: '=*', tokens: ['test'], description: 'Prefix search' },
{ operator: '*=', tokens: ['ing'], description: 'Suffix search' },
{ operator: '~=', tokens: ['nite'], description: 'Fuzzy search' }
];
for (const test of tests) {
try {
console.log(`\n Testing ${test.description} (${test.operator}):`);
const startTime = Date.now();
const results = searchService.search(test.tokens, test.operator, searchContext);
const duration = Date.now() - startTime;
const resultCount = Array.isArray(results) ? results.length : results.size || 0;
console.log(` Found ${resultCount} results in ${duration}ms`);
if (resultCount > 0) {
const sampleResults = Array.isArray(results) ? results.slice(0, 3) : Array.from(results).slice(0, 3);
console.log(` Sample results: ${sampleResults.join(', ')}...`);
}
} catch (error: any) {
console.log(` ✗ Error: ${error.message}`);
}
}
}
async function main() {
console.log("========================================");
console.log(" SQLite Search Implementation Test");
console.log("========================================");
try {
// Check current database version
const currentDbVersion = sql.getValue<number>("SELECT value FROM options WHERE name = 'dbVersion'") || 0;
console.log(`\nCurrent database version: ${currentDbVersion}`);
console.log(`Target database version: 235`);
if (currentDbVersion < 235) {
console.log("\n⚠ Database needs migration from version " + currentDbVersion + " to 235");
console.log(" Restart the server to run migrations.");
return;
}
// Verify tables exist
const tablesExist = await verifyTables();
if (!tablesExist) {
return;
}
// Verify functions work
const functionsWork = await verifyFunctions();
// Check index content
await verifySearchContent();
// Test search if everything is ready
if (tablesExist && functionsWork) {
await testSearch();
}
console.log("\n========================================");
console.log(" Test Complete");
console.log("========================================");
if (tablesExist && functionsWork) {
console.log("\n✅ SQLite search implementation is ready!");
console.log("\nTo enable SQLite search:");
console.log(" 1. Set searchBackend option to 'sqlite'");
console.log(" 2. Or use the admin API: PUT /api/search-admin/config");
} else {
console.log("\n❌ SQLite search is not ready. See issues above.");
}
} catch (error: any) {
console.error("\n❌ Test failed with error:", error);
console.error(error.stack);
}
}
// Run if executed directly
if (require.main === module) {
main().then(() => process.exit(0)).catch(() => process.exit(1));
}
export { verifyTables, verifyFunctions, testSearch };

View File

@@ -136,14 +136,6 @@ export interface OptionDefinitions extends KeyboardShortcutsOptions<KeyboardActi
redirectBareDomain: boolean;
showLoginInShareTheme: boolean;
// Search settings
searchBackend: string;
searchSqliteEnabled: boolean;
searchSqlitePerformanceLogging: boolean;
searchSqliteMaxMemory: number;
searchSqliteBatchSize: number;
searchSqliteAutoRebuild: boolean;
// AI/LLM integration options
aiEnabled: boolean;
aiProvider: string;