// Trilium/apps/server/src/migrations/0235__sqlite_native_search.ts
/**
* Migration to add SQLite native search support with normalized text tables
*
* This migration implements Phase 1 of the SQLite-based search plan:
* 1. Creates note_search_content table with normalized text columns
* 2. Creates note_tokens table for word-level token storage
* 3. Adds necessary indexes for optimization
* 4. Creates triggers to keep tables synchronized with note updates
* 5. Populates tables with existing note data in batches
*
* This provides 100% accurate search results with 10-30x performance improvement
* over TypeScript-based search, without the complexity of trigrams.
*/
import sql from "../services/sql.js";
import log from "../services/log.js";
import { normalize as utilsNormalize, stripTags } from "../services/utils.js";
import { getSqliteFunctionsService } from "../services/search/sqlite_functions.js";
/**
* Uses the existing normalize function from utils.ts for consistency
* This ensures all normalization throughout the codebase is identical
*/
function normalizeText(text: string): string {
if (!text) return '';
return utilsNormalize(text);
}
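/*
 * Illustrative example (assuming utilsNormalize decomposes Unicode, strips
 * diacritics, and lowercases, per its implementation in utils.ts):
 *
 *   normalizeText("Příliš Žluťoučký");  // -> "prilis zlutoucky"
 */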
/**
* Tokenizes text into individual words for token-based searching
* Handles punctuation and special characters appropriately
*/
function tokenize(text: string): string[] {
if (!text) return [];
    // Split on word boundaries, filter out empty tokens.
    // The separator class covers whitespace and punctuation but deliberately
    // excludes the apostrophe so that contractions ("don't", "it's") survive
    // as single tokens.
    const tokens = text
        .split(/[\s\n\r\t,;.!?()[\]{}"`~@#$%^&*+=|\\/<>:_-]+/)
        .filter(token => token.length > 0);
    // Also split on camelCase boundaries for code content. This must happen
    // before lowercasing, otherwise the uppercase boundaries are lost.
    // snake_case needs no extra handling: "_" is already a separator in the
    // split regex above, so "user_name" arrives here as "user" and "name".
    const expandedTokens: string[] = [];
    for (const token of tokens) {
        // Add the original token, lowercased
        expandedTokens.push(token.toLowerCase());
        // Split camelCase (e.g., "getUserName" -> ["get", "User", "Name"])
        const camelCaseParts = token.split(/(?=[A-Z])/);
        if (camelCaseParts.length > 1) {
            expandedTokens.push(...camelCaseParts.map(p => p.toLowerCase()));
        }
    }
    // Remove duplicates and return
    return Array.from(new Set(expandedTokens));
}
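/*
 * Illustrative example of the rules above:
 *
 *   tokenize("getUserName user_name don't");
 *   // -> ["getusername", "get", "user", "name", "don't"]
 *   // "user_name" splits at the underscore separator, "getUserName" also
 *   // contributes its camelCase parts, and duplicates are removed.
 */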
/**
* Strips HTML tags from content for text-only indexing
* Uses the utils stripTags function for consistency
*/
function stripHtmlTags(html: string): string {
if (!html) return '';
// Remove script and style content entirely first
let text = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
// Use utils stripTags for consistency
text = stripTags(text);
// Decode common HTML entities; &amp; must be decoded last so that
// double-encoded sequences such as "&amp;quot;" are not decoded twice
text = text.replace(/&nbsp;/g, ' ');
text = text.replace(/&lt;/g, '<');
text = text.replace(/&gt;/g, '>');
text = text.replace(/&quot;/g, '"');
text = text.replace(/&#39;/g, "'");
text = text.replace(/&amp;/g, '&');
// Normalize whitespace
text = text.replace(/\s+/g, ' ').trim();
return text;
}
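/*
 * Illustrative example (assuming stripTags in utils.ts removes markup tags,
 * per its name):
 *
 *   stripHtmlTags('<p>Ben &amp; Jerry</p><script>evil()</script>');
 *   // -> "Ben & Jerry"
 */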
export default function sqliteNativeSearch() {
log.info("Starting SQLite native search migration...");
const startTime = Date.now();
// Wrap entire migration in a transaction for atomicity
sql.transactional(() => {
try {
// Register custom SQL functions first so they can be used in triggers
registerCustomFunctions();
// Create the search tables and indexes
createSearchTables();
// Create triggers to keep tables synchronized (before population)
createSearchTriggers();
// Populate the tables with existing note data
populateSearchTables();
// Run final verification and optimization
finalizeSearchSetup();
const duration = Date.now() - startTime;
log.info(`SQLite native search migration completed successfully in ${duration}ms`);
} catch (error) {
log.error(`SQLite native search migration failed: ${error}`);
// Transaction will automatically rollback on error
throw error;
}
});
}
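/*
 * Illustrative invocation — in practice Trilium's migration runner imports
 * and executes this default export; the exact runner API lives outside this
 * file:
 *
 *   import sqliteNativeSearch from "./0235__sqlite_native_search.js";
 *   sqliteNativeSearch(); // runs all five phases inside one transaction
 */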
function createSearchTables() {
log.info("Creating search content and token tables...");
// Drop existing tables if they exist (for re-running migration in dev)
sql.execute("DROP TABLE IF EXISTS note_search_content");
sql.execute("DROP TABLE IF EXISTS note_tokens");
// Create the main search content table
sql.execute(`
CREATE TABLE note_search_content (
noteId TEXT PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
title_normalized TEXT NOT NULL,
content_normalized TEXT NOT NULL,
full_text_normalized TEXT NOT NULL
)
`);
// Create the token table for word-level operations
sql.execute(`
CREATE TABLE note_tokens (
noteId TEXT NOT NULL,
token TEXT NOT NULL,
token_normalized TEXT NOT NULL,
position INTEGER NOT NULL,
source TEXT NOT NULL CHECK(source IN ('title', 'content')),
PRIMARY KEY (noteId, position, source)
)
`);
// Create indexes for search optimization
log.info("Creating search indexes...");
// Consolidated indexes: only COLLATE NOCASE variants are created, since
// plain duplicates would be redundant for case-insensitive search
sql.execute(`
CREATE INDEX idx_search_title_normalized
ON note_search_content(title_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_search_content_normalized
ON note_search_content(content_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_search_full_text
ON note_search_content(full_text_normalized COLLATE NOCASE)
`);
// Token indexes - consolidated to avoid redundancy
sql.execute(`
CREATE INDEX idx_tokens_normalized
ON note_tokens(token_normalized COLLATE NOCASE)
`);
sql.execute(`
CREATE INDEX idx_tokens_noteId
ON note_tokens(noteId)
`);
// Composite index for token searches with source
sql.execute(`
CREATE INDEX idx_tokens_source_normalized
ON note_tokens(source, token_normalized COLLATE NOCASE)
`);
log.info("Search tables and indexes created successfully");
}
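/*
 * The COLLATE NOCASE indexes above serve case-insensitive equality and
 * prefix lookups; infix LIKE patterns still scan, but against the smaller
 * pre-normalized columns. Illustrative queries (not part of the migration):
 *
 *   -- prefix search on titles; can use idx_search_title_normalized
 *   SELECT noteId FROM note_search_content
 *   WHERE title_normalized LIKE 'meeting%';
 *
 *   -- exact token lookup; can use idx_tokens_normalized
 *   SELECT DISTINCT noteId FROM note_tokens
 *   WHERE token_normalized = 'invoice';
 */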
function populateSearchTables() {
log.info("Populating search tables with existing note content...");
const batchSize = 100;
let offset = 0;
let totalProcessed = 0;
let totalTokens = 0;
while (true) {
const notes = sql.getRows<{
noteId: string;
title: string;
type: string;
mime: string;
content: string | null;
}>(`
SELECT
n.noteId,
n.title,
n.type,
n.mime,
b.content
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.isDeleted = 0
AND n.isProtected = 0
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
ORDER BY n.noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
if (notes.length === 0) {
break;
}
// Process batch of notes
for (const note of notes) {
try {
// Process content based on type
let processedContent = note.content || '';
// Strip HTML for text notes
if (note.type === 'text' && note.mime === 'text/html') {
processedContent = stripHtmlTags(processedContent);
}
// Normalize text for searching using the utils normalize function
const titleNorm = normalizeText(note.title);
const contentNorm = normalizeText(processedContent);
const fullTextNorm = titleNorm + ' ' + contentNorm;
// Insert into search content table
sql.execute(`
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
VALUES (?, ?, ?, ?, ?, ?)
`, [
note.noteId,
note.title,
processedContent,
titleNorm,
contentNorm,
fullTextNorm
]);
// Tokenize title and content separately to track source
const titleTokens = tokenize(note.title);
const contentTokens = tokenize(processedContent);
let position = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
// Insert content tokens with unique positions
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
totalProcessed++;
} catch (error) {
log.error(`Failed to index note ${note.noteId}: ${error}`);
// Continue with other notes even if one fails
}
}
offset += batchSize;
// Log progress every ten batches; keyed on offset because totalProcessed
// can skip exact multiples of 1000 when individual notes fail
if (offset % 1000 === 0) {
log.info(`Processed ${totalProcessed} notes, ${totalTokens} tokens for search indexing...`);
}
}
log.info(`Completed indexing ${totalProcessed} notes with ${totalTokens} total tokens`);
}
function createSearchTriggers() {
log.info("Creating triggers to keep search tables synchronized...");
// Drop existing triggers if they exist
const triggers = [
'note_search_insert',
'note_search_update',
'note_search_delete',
'note_search_soft_delete',
'note_search_undelete',
'note_search_protect',
'note_search_unprotect',
'note_search_blob_insert',
'note_search_blob_update'
];
for (const trigger of triggers) {
sql.execute(`DROP TRIGGER IF EXISTS ${trigger}`);
}
// Trigger for INSERT operations on notes. Simplified: it uses built-in
// LOWER() rather than the custom normalize function, so it keeps working on
// connections where the custom functions are not registered.
sql.execute(`
CREATE TRIGGER note_search_insert
AFTER INSERT ON notes
WHEN NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isDeleted = 0
AND NEW.isProtected = 0
BEGIN
-- Delete any existing entries (for INSERT OR REPLACE)
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
-- Insert basic content with title only (content will be populated by blob trigger)
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
VALUES (
NEW.noteId,
NEW.title,
'',
LOWER(NEW.title),
'',
LOWER(NEW.title)
);
END
`);
// Trigger for UPDATE operations on notes - simplified version
sql.execute(`
CREATE TRIGGER note_search_update
AFTER UPDATE ON notes
WHEN NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
BEGIN
-- Always delete the old entries
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
-- Re-insert if note is not deleted and not protected
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId
AND NEW.isDeleted = 0
AND NEW.isProtected = 0;
END
`);
// Trigger for DELETE operations on notes
sql.execute(`
CREATE TRIGGER note_search_delete
AFTER DELETE ON notes
BEGIN
DELETE FROM note_search_content WHERE noteId = OLD.noteId;
DELETE FROM note_tokens WHERE noteId = OLD.noteId;
END
`);
// Trigger for soft delete (isDeleted = 1)
sql.execute(`
CREATE TRIGGER note_search_soft_delete
AFTER UPDATE ON notes
WHEN OLD.isDeleted = 0 AND NEW.isDeleted = 1
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
END
`);
// Trigger for undelete (isDeleted = 0) - simplified version
sql.execute(`
CREATE TRIGGER note_search_undelete
AFTER UPDATE ON notes
WHEN OLD.isDeleted = 1 AND NEW.isDeleted = 0
AND NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isProtected = 0
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId;
END
`);
// Trigger for notes becoming protected
sql.execute(`
CREATE TRIGGER note_search_protect
AFTER UPDATE ON notes
WHEN OLD.isProtected = 0 AND NEW.isProtected = 1
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
END
`);
// Trigger for notes becoming unprotected - simplified version
sql.execute(`
CREATE TRIGGER note_search_unprotect
AFTER UPDATE ON notes
WHEN OLD.isProtected = 1 AND NEW.isProtected = 0
AND NEW.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND NEW.isDeleted = 0
BEGIN
DELETE FROM note_search_content WHERE noteId = NEW.noteId;
DELETE FROM note_tokens WHERE noteId = NEW.noteId;
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
NEW.noteId,
NEW.title,
COALESCE(b.content, ''),
LOWER(NEW.title),
LOWER(COALESCE(b.content, '')),
LOWER(NEW.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON b.blobId = NEW.blobId
WHERE n.noteId = NEW.noteId;
END
`);
// Trigger for INSERT operations on blobs - simplified version
sql.execute(`
CREATE TRIGGER note_search_blob_insert
AFTER INSERT ON blobs
BEGIN
-- Update search content for all notes that reference this blob
UPDATE note_search_content
SET content = NEW.content,
content_normalized = LOWER(NEW.content),
full_text_normalized = title_normalized || ' ' || LOWER(NEW.content)
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
-- Clear tokens for affected notes (will be repopulated by post-processing)
DELETE FROM note_tokens
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
END
`);
// Trigger for UPDATE operations on blobs - simplified version
sql.execute(`
CREATE TRIGGER note_search_blob_update
AFTER UPDATE ON blobs
BEGIN
-- Update search content for all notes that reference this blob
UPDATE note_search_content
SET content = NEW.content,
content_normalized = LOWER(NEW.content),
full_text_normalized = title_normalized || ' ' || LOWER(NEW.content)
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
-- Clear tokens for affected notes (will be repopulated by post-processing)
DELETE FROM note_tokens
WHERE noteId IN (
SELECT n.noteId
FROM notes n
WHERE n.blobId = NEW.blobId
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
);
END
`);
log.info("Search synchronization triggers created successfully");
}
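/*
 * Illustrative effect of the triggers above (not executed here): after
 *
 *   UPDATE notes SET title = 'Quarterly Report' WHERE noteId = 'abc';
 *
 * note_search_update fires, the note_search_content row for 'abc' is rebuilt
 * with title_normalized = 'quarterly report', and its note_tokens rows are
 * cleared, to be repopulated by the post-processing step.
 */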
function registerCustomFunctions() {
log.info("Registering custom SQL functions for search operations...");
try {
// Get the database connection to register functions
const db = sql.getDbConnection();
// Use the centralized SQLite functions service
const functionsService = getSqliteFunctionsService();
// Register functions if not already registered
if (!functionsService.isRegistered()) {
const success = functionsService.registerFunctions(db);
if (success) {
log.info("Custom SQL functions registered successfully via service");
} else {
log.info("Custom SQL functions registration failed - using basic SQLite functions only");
}
} else {
log.info("Custom SQL functions already registered");
}
// Register migration-specific helper function for tokenization
db.function('tokenize_for_migration', {
deterministic: true,
varargs: false
}, (text: string | null) => {
if (!text) return '';
// Return as JSON array string for SQL processing
return JSON.stringify(tokenize(text));
});
} catch (error) {
log.info(`Could not register custom SQL functions (will use basic SQLite functions): ${error}`);
// This is not critical - the migration will work with basic SQLite functions
}
}
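/*
 * Once registered, the migration helper can be called from SQL, e.g.:
 *
 *   SELECT tokenize_for_migration('getUserName');
 *   -- -> '["getusername","get","user","name"]' (a JSON array string)
 */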
/**
* Populates tokens for a specific note
* This is called outside of triggers to avoid complex SQL within trigger constraints
*/
function populateNoteTokens(noteId: string): number {
try {
// Get the note's search content
const noteData = sql.getRow<{
title: string;
content: string;
}>(`
SELECT title, content
FROM note_search_content
WHERE noteId = ?
`, [noteId]);
if (!noteData) return 0;
// Clear existing tokens for this note
sql.execute(`DELETE FROM note_tokens WHERE noteId = ?`, [noteId]);
// Tokenize title and content
const titleTokens = tokenize(noteData.title);
const contentTokens = tokenize(noteData.content);
let position = 0;
let tokenCount = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [noteId, token, normalizeText(token), position]);
position++;
tokenCount++;
}
}
// Insert content tokens
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [noteId, token, normalizeText(token), position]);
position++;
tokenCount++;
}
}
return tokenCount;
} catch (error) {
log.error(`Error populating tokens for note ${noteId}: ${error}`);
return 0;
}
}
/**
* Populates tokens for multiple notes affected by blob operations.
* This handles cases where blob triggers can affect multiple notes.
* The migration itself covers this work via populateAllTokens(); this helper
* backs the post-processing step referenced by the blob triggers above.
*/
function populateBlobAffectedTokens(blobId: string): void {
try {
// Find all notes that reference this blob and need token updates
const affectedNoteIds = sql.getColumn<string>(`
SELECT DISTINCT n.noteId
FROM notes n
INNER JOIN note_search_content nsc ON n.noteId = nsc.noteId
WHERE n.blobId = ?
AND n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
`, [blobId]);
if (affectedNoteIds.length === 0) return;
log.info(`Updating tokens for ${affectedNoteIds.length} notes affected by blob ${blobId}`);
let totalTokens = 0;
for (const noteId of affectedNoteIds) {
const tokenCount = populateNoteTokens(noteId);
totalTokens += tokenCount;
}
log.info(`Updated ${totalTokens} tokens for blob-affected notes`);
} catch (error) {
log.error(`Error populating blob-affected tokens for blob ${blobId}: ${error}`);
}
}
function populateAllTokens() {
log.info("Populating tokens for all search content...");
// Clear existing tokens first to ensure clean state
sql.execute("DELETE FROM note_tokens");
const batchSize = 100;
let offset = 0;
let totalProcessed = 0;
let totalTokens = 0;
while (true) {
const notes = sql.getRows<{
noteId: string;
title: string;
content: string;
}>(`
SELECT noteId, title, content
FROM note_search_content
ORDER BY noteId
LIMIT ? OFFSET ?
`, [batchSize, offset]);
if (notes.length === 0) {
break;
}
for (const note of notes) {
try {
// Tokenize title and content
const titleTokens = tokenize(note.title);
const contentTokens = tokenize(note.content);
let position = 0;
// Insert title tokens
for (const token of titleTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'title')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
// Insert content tokens with continuous position numbering
for (const token of contentTokens) {
if (token.length > 0) {
sql.execute(`
INSERT OR IGNORE INTO note_tokens
(noteId, token, token_normalized, position, source)
VALUES (?, ?, ?, ?, 'content')
`, [note.noteId, token, normalizeText(token), position]);
position++;
totalTokens++;
}
}
totalProcessed++;
} catch (error) {
log.error(`Failed to tokenize note ${note.noteId}: ${error}`);
}
}
offset += batchSize;
// Log progress every ten batches (see the note in populateSearchTables)
if (offset % 1000 === 0) {
log.info(`Processed ${totalProcessed} notes, ${totalTokens} tokens so far...`);
}
}
log.info(`Token population completed: ${totalProcessed} notes processed, ${totalTokens} total tokens`);
}
function finalizeSearchSetup() {
log.info("Running final verification and optimization...");
// Check for missing notes that should be indexed
const missingCount = sql.getValue<number>(`
SELECT COUNT(*) FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
AND NOT EXISTS (SELECT 1 FROM note_search_content WHERE noteId = n.noteId)
`) || 0;
if (missingCount > 0) {
log.info(`Found ${missingCount} notes that are missing from search index`);
// Index missing notes using basic SQLite functions
sql.execute(`
INSERT INTO note_search_content
(noteId, title, content, title_normalized, content_normalized, full_text_normalized)
SELECT
n.noteId,
n.title,
COALESCE(b.content, ''),
LOWER(n.title),
LOWER(COALESCE(b.content, '')),
LOWER(n.title || ' ' || COALESCE(b.content, ''))
FROM notes n
LEFT JOIN blobs b ON n.blobId = b.blobId
WHERE n.type IN ('text', 'code', 'mermaid', 'canvas', 'mindMap')
AND n.isDeleted = 0
AND n.isProtected = 0
AND b.content IS NOT NULL
AND NOT EXISTS (SELECT 1 FROM note_search_content WHERE noteId = n.noteId)
`);
log.info(`Indexed ${missingCount} missing notes`);
}
// Populate tokens for all existing content (including any missing notes we just added)
populateAllTokens();
// Verify table creation
const tables = sql.getColumn<string>(`
SELECT name FROM sqlite_master
WHERE type = 'table'
AND name IN ('note_search_content', 'note_tokens')
`);
if (tables.length !== 2) {
throw new Error("Search tables were not created properly");
}
// Check row counts
const searchContentCount = sql.getValue<number>("SELECT COUNT(*) FROM note_search_content") || 0;
const tokenCount = sql.getValue<number>("SELECT COUNT(*) FROM note_tokens") || 0;
log.info(`Search content table has ${searchContentCount} entries`);
log.info(`Token table has ${tokenCount} entries`);
// Run ANALYZE to update SQLite query planner statistics
log.info("Updating SQLite statistics for query optimization...");
sql.execute("ANALYZE note_search_content");
sql.execute("ANALYZE note_tokens");
// Verify indexes were created
const indexes = sql.getColumn<string>(`
SELECT name FROM sqlite_master
WHERE type = 'index'
AND tbl_name IN ('note_search_content', 'note_tokens')
`);
log.info(`Created ${indexes.length} indexes for search optimization`);
log.info("Search setup finalization completed");
}
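/*
 * With both tables populated, a multi-token AND search can be answered
 * entirely inside SQLite. Illustrative query (not part of the migration):
 *
 *   SELECT noteId
 *   FROM note_tokens
 *   WHERE token_normalized IN ('quarterly', 'report')
 *   GROUP BY noteId
 *   HAVING COUNT(DISTINCT token_normalized) = 2;
 */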