Trilium/apps/server/src/services/search/utils/text_utils.ts

"use strict";

import { normalize } from "../../utils.js";

/**
 * Shared text processing utilities for search functionality
 */

// Configuration constants for fuzzy matching
export const FUZZY_SEARCH_CONFIG = {
    // Minimum token length for fuzzy operators to prevent false positives
    MIN_FUZZY_TOKEN_LENGTH: 3,
    // Maximum edit distance for fuzzy matching
    MAX_EDIT_DISTANCE: 2,
    // Maximum proximity distance for phrase matching (in words)
    MAX_PHRASE_PROXIMITY: 10,
    // Absolute hard limits for extreme cases - only to prevent system crashes
    ABSOLUTE_MAX_CONTENT_SIZE: 100 * 1024 * 1024, // 100MB - extreme upper limit to prevent OOM
    ABSOLUTE_MAX_WORD_COUNT: 2000000, // 2M words - extreme upper limit for word processing
    // Performance warning thresholds - inform user but still attempt search
    PERFORMANCE_WARNING_SIZE: 5 * 1024 * 1024, // 5MB - warn about potential performance impact
    PERFORMANCE_WARNING_WORDS: 100000, // 100K words - warn about word count impact
    // Progressive processing thresholds for very large content
    PROGRESSIVE_PROCESSING_SIZE: 10 * 1024 * 1024, // 10MB - use progressive processing
    PROGRESSIVE_PROCESSING_WORDS: 500000, // 500K words - use progressive processing
    // Performance thresholds
    EARLY_TERMINATION_THRESHOLD: 3,
} as const;

/**
 * Normalizes text by removing diacritics and converting to lowercase.
 * This is the centralized text normalization function used across all search components.
 * Uses the shared normalize function from utils for consistency.
 *
 * Examples:
 * - "café" -> "cafe"
 * - "naïve" -> "naive"
 * - "HELLO WORLD" -> "hello world"
 *
 * @param text The text to normalize
 * @returns The normalized text
 */
export function normalizeSearchText(text: string): string {
    if (!text || typeof text !== 'string') {
        return '';
    }

    // Use shared normalize function for consistency across the codebase
    return normalize(text);
}

/**
 * Optimized edit distance calculation using single array and early termination.
 * This is significantly more memory efficient than the 2D matrix approach and includes
 * early termination optimizations for better performance.
 *
 * @param str1 First string
 * @param str2 Second string
 * @param maxDistance Maximum allowed distance (for early termination)
 * @returns The edit distance between the strings, or maxDistance + 1 if exceeded
 */
export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
    // Input validation
    if (typeof str1 !== 'string' || typeof str2 !== 'string') {
        throw new Error('Both arguments must be strings');
    }

    if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
        throw new Error('maxDistance must be a non-negative integer');
    }

    const len1 = str1.length;
    const len2 = str2.length;

    // Performance guard: if strings are too long, limit processing
    const maxStringLength = 1000;
    if (len1 > maxStringLength || len2 > maxStringLength) {
        // For very long strings, fall back to simple length-based heuristic
        return Math.abs(len1 - len2) <= maxDistance ? Math.abs(len1 - len2) : maxDistance + 1;
    }

    // Early termination: if length difference exceeds max distance
    if (Math.abs(len1 - len2) > maxDistance) {
        return maxDistance + 1;
    }

    // Handle edge cases
    if (len1 === 0) return len2 <= maxDistance ? len2 : maxDistance + 1;
    if (len2 === 0) return len1 <= maxDistance ? len1 : maxDistance + 1;

    // Use single array optimization for better memory usage
    let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
    let currentRow = new Array(len2 + 1);

    for (let i = 1; i <= len1; i++) {
        currentRow[0] = i;
        let minInRow = i;

        for (let j = 1; j <= len2; j++) {
            const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
            currentRow[j] = Math.min(
                previousRow[j] + 1,        // deletion
                currentRow[j - 1] + 1,     // insertion
                previousRow[j - 1] + cost  // substitution
            );

            // Track minimum value in current row for early termination
            if (currentRow[j] < minInRow) {
                minInRow = currentRow[j];
            }
        }

        // Early termination: if minimum distance in row exceeds threshold
        if (minInRow > maxDistance) {
            return maxDistance + 1;
        }

        // Swap arrays for next iteration
        [previousRow, currentRow] = [currentRow, previousRow];
    }

    const result = previousRow[len2];
    return result <= maxDistance ? result : maxDistance + 1;
}

/**
 * Validates that tokens meet minimum requirements for fuzzy operators.
 *
 * @param tokens Array of search tokens
 * @param operator The search operator being used
 * @returns Validation result with success status and error message
 */
export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
    if (!operator || typeof operator !== 'string') {
        return {
            isValid: false,
            error: 'Invalid operator: operator must be a non-empty string'
        };
    }

    if (!Array.isArray(tokens)) {
        return {
            isValid: false,
            error: 'Invalid tokens: tokens must be an array'
        };
    }

    if (tokens.length === 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: at least one token is required'
        };
    }

    // Check for null, undefined, or non-string tokens
    const invalidTypeTokens = tokens.filter(token =>
        token == null || typeof token !== 'string'
    );

    if (invalidTypeTokens.length > 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: all tokens must be non-null strings'
        };
    }

    // Check for empty string tokens
    const emptyTokens = tokens.filter(token => token.trim().length === 0);

    if (emptyTokens.length > 0) {
        return {
            isValid: false,
            error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
        };
    }

    if (operator !== '~=' && operator !== '~*') {
        return { isValid: true };
    }

    // Check minimum token length for fuzzy operators
    const shortTokens = tokens.filter(token => token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH);

    if (shortTokens.length > 0) {
        return {
            isValid: false,
            error: `Fuzzy search operators (~=, ~*) require tokens of at least ${FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH} characters. Invalid tokens: ${shortTokens.join(', ')}`
        };
    }

    // Check for excessively long tokens that could cause performance issues
    const maxTokenLength = 100; // Reasonable limit for search tokens
    const longTokens = tokens.filter(token => token.length > maxTokenLength);

    if (longTokens.length > 0) {
        return {
            isValid: false,
            error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${longTokens.map(t => t.substring(0, 20) + '...').join(', ')}`
        };
    }

    return { isValid: true };
}

/**
 * Validates and preprocesses content for search operations.
 * Philosophy: Try to search everything! Only block truly extreme cases that could crash the system.
 *
 * @param content The content to validate and preprocess
 * @param noteId The note ID (for logging purposes)
 * @returns Processed content, only null for truly extreme cases that could cause system instability
 */
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
    if (!content || typeof content !== 'string') {
        return null;
    }

    // Only block content that could actually crash the system (100MB+)
    if (content.length > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE) {
        console.error(`Content size exceeds absolute system limit for note ${noteId || 'unknown'}: ${content.length} bytes - this could cause system instability`);
        // Only in truly extreme cases, truncate to prevent system crash
        return content.substring(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_CONTENT_SIZE);
    }

    // Warn about very large content but still process it
    if (content.length > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_SIZE) {
        console.info(`Large content for note ${noteId || 'unknown'}: ${content.length} bytes - processing may take time but will attempt full search`);
    }

    // For word count, be even more permissive - only block truly extreme cases
    const wordCount = content.split(/\s+/).length;
    if (wordCount > FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT) {
        console.error(`Word count exceeds absolute system limit for note ${noteId || 'unknown'}: ${wordCount} words - this could cause system instability`);
        // Only in truly extreme cases, truncate to prevent system crash
        return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.ABSOLUTE_MAX_WORD_COUNT).join(' ');
    }

    // Warn about high word counts but still process them
    if (wordCount > FUZZY_SEARCH_CONFIG.PERFORMANCE_WARNING_WORDS) {
        console.info(`High word count for note ${noteId || 'unknown'}: ${wordCount} words - phrase matching may take time but will attempt full search`);
    }

    // Progressive processing warning for very large content
    if (content.length > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_SIZE || wordCount > FUZZY_SEARCH_CONFIG.PROGRESSIVE_PROCESSING_WORDS) {
        console.info(`Very large content for note ${noteId || 'unknown'} - using progressive processing to maintain responsiveness`);
    }

    return content;
}

/**
 * Escapes special regex characters in a string for use in RegExp constructor
 */
function escapeRegExp(string: string): string {
    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Checks if a word matches a token with fuzzy matching and returns the matched word.
 * Optimized for common case where distances are small.
 *
 * @param token The search token (should be normalized)
 * @param text The text to match against (should be normalized)
 * @param maxDistance Maximum allowed edit distance
 * @returns The matched word if found, null otherwise
 */
export function fuzzyMatchWordWithResult(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): string | null {
    // Input validation
    if (typeof token !== 'string' || typeof text !== 'string') {
        return null;
    }

    if (token.length === 0 || text.length === 0) {
        return null;
    }

    try {
        // Normalize both strings for comparison
        const normalizedToken = token.toLowerCase();
        const normalizedText = text.toLowerCase();

        // Exact match check first (most common case)
        if (normalizedText.includes(normalizedToken)) {
            // Find the exact match in the original text to preserve case
            const exactMatch = text.match(new RegExp(escapeRegExp(token), 'i'));
            return exactMatch ? exactMatch[0] : token;
        }

        // For fuzzy matching, we need to check individual words in the text
        // Split the text into words and check each word against the token
        const words = normalizedText.split(/\s+/).filter(word => word.length > 0);
        const originalWords = text.split(/\s+/).filter(word => word.length > 0);

        for (let i = 0; i < words.length; i++) {
            const word = words[i];
            const originalWord = originalWords[i];

            // Skip if word is too different in length for fuzzy matching
            if (Math.abs(word.length - normalizedToken.length) > maxDistance) {
                continue;
            }

            // For very short tokens or very different lengths, be more strict
            if (normalizedToken.length < 4 || Math.abs(word.length - normalizedToken.length) > 2) {
                continue;
            }

            // Use optimized edit distance calculation
            const distance = calculateOptimizedEditDistance(normalizedToken, word, maxDistance);
            if (distance <= maxDistance) {
                return originalWord; // Return the original word with case preserved
            }
        }

        return null;
    } catch (error) {
        // Log error and return null for safety
        console.warn('Error in fuzzy word matching:', error);
        return null;
    }
}

/**
 * Checks if a word matches a token with fuzzy matching.
 * Optimized for common case where distances are small.
 *
 * @param token The search token (should be normalized)
 * @param word The word to match against (should be normalized)
 * @param maxDistance Maximum allowed edit distance
 * @returns True if the word matches the token within the distance threshold
 */
export function fuzzyMatchWord(token: string, text: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
    return fuzzyMatchWordWithResult(token, text, maxDistance) !== null;
}