Mirror of https://github.com/zadam/trilium.git (synced 2025-11-05 04:45:47 +01:00).
Commit: feat(search): implement additional weights for search results; normalize text as well.
This commit is contained in:
@@ -2,6 +2,27 @@
|
|||||||
|
|
||||||
import beccaService from "../../becca/becca_service.js";
|
import beccaService from "../../becca/becca_service.js";
|
||||||
import becca from "../../becca/becca.js";
|
import becca from "../../becca/becca.js";
|
||||||
|
import {
|
||||||
|
normalizeSearchText,
|
||||||
|
calculateOptimizedEditDistance,
|
||||||
|
FUZZY_SEARCH_CONFIG
|
||||||
|
} from "./utils/text_utils.js";
|
||||||
|
|
||||||
|
// Scoring constants for better maintainability
|
||||||
|
const SCORE_WEIGHTS = {
|
||||||
|
NOTE_ID_EXACT_MATCH: 1000,
|
||||||
|
TITLE_EXACT_MATCH: 2000,
|
||||||
|
TITLE_PREFIX_MATCH: 500,
|
||||||
|
TITLE_WORD_MATCH: 300,
|
||||||
|
TOKEN_EXACT_MATCH: 4,
|
||||||
|
TOKEN_PREFIX_MATCH: 2,
|
||||||
|
TOKEN_CONTAINS_MATCH: 1,
|
||||||
|
TOKEN_FUZZY_MATCH: 0.5,
|
||||||
|
TITLE_FACTOR: 2.0,
|
||||||
|
PATH_FACTOR: 0.3,
|
||||||
|
HIDDEN_NOTE_PENALTY: 3
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
|
||||||
class SearchResult {
|
class SearchResult {
|
||||||
notePathArray: string[];
|
notePathArray: string[];
|
||||||
@@ -27,49 +48,92 @@ class SearchResult {
|
|||||||
this.score = 0;
|
this.score = 0;
|
||||||
|
|
||||||
const note = becca.notes[this.noteId];
|
const note = becca.notes[this.noteId];
|
||||||
const normalizedQuery = fulltextQuery.toLowerCase();
|
const normalizedQuery = normalizeSearchText(fulltextQuery.toLowerCase());
|
||||||
const normalizedTitle = note.title.toLowerCase();
|
const normalizedTitle = normalizeSearchText(note.title.toLowerCase());
|
||||||
|
|
||||||
// Note ID exact match, much higher score
|
// Note ID exact match, much higher score
|
||||||
if (note.noteId.toLowerCase() === fulltextQuery) {
|
if (note.noteId.toLowerCase() === fulltextQuery) {
|
||||||
this.score += 1000;
|
this.score += SCORE_WEIGHTS.NOTE_ID_EXACT_MATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Title matching scores, make sure to always win
|
// Title matching scores with fuzzy matching support
|
||||||
if (normalizedTitle === normalizedQuery) {
|
if (normalizedTitle === normalizedQuery) {
|
||||||
this.score += 2000; // Increased from 1000 to ensure exact matches always win
|
this.score += SCORE_WEIGHTS.TITLE_EXACT_MATCH;
|
||||||
} else if (normalizedTitle.startsWith(normalizedQuery)) {
|
} else if (normalizedTitle.startsWith(normalizedQuery)) {
|
||||||
this.score += 500; // Increased to give more weight to prefix matches
|
this.score += SCORE_WEIGHTS.TITLE_PREFIX_MATCH;
|
||||||
} else if (normalizedTitle.includes(` ${normalizedQuery} `) || normalizedTitle.startsWith(`${normalizedQuery} `) || normalizedTitle.endsWith(` ${normalizedQuery}`)) {
|
} else if (this.isWordMatch(normalizedTitle, normalizedQuery)) {
|
||||||
this.score += 300; // Increased to better distinguish word matches
|
this.score += SCORE_WEIGHTS.TITLE_WORD_MATCH;
|
||||||
|
} else {
|
||||||
|
// Try fuzzy matching for typos
|
||||||
|
const fuzzyScore = this.calculateFuzzyTitleScore(normalizedTitle, normalizedQuery);
|
||||||
|
this.score += fuzzyScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add scores for partial matches with adjusted weights
|
// Add scores for token matches
|
||||||
this.addScoreForStrings(tokens, note.title, 2.0); // Increased to give more weight to title matches
|
this.addScoreForStrings(tokens, note.title, SCORE_WEIGHTS.TITLE_FACTOR);
|
||||||
this.addScoreForStrings(tokens, this.notePathTitle, 0.3); // Reduced to further de-emphasize path matches
|
this.addScoreForStrings(tokens, this.notePathTitle, SCORE_WEIGHTS.PATH_FACTOR);
|
||||||
|
|
||||||
if (note.isInHiddenSubtree()) {
|
if (note.isInHiddenSubtree()) {
|
||||||
this.score = this.score / 3; // Increased penalty for hidden notes
|
this.score = this.score / SCORE_WEIGHTS.HIDDEN_NOTE_PENALTY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Adds token-based match scores for `tokens` against the string `str`.
 *
 * The string is normalized and split into space-separated chunks; each chunk is
 * compared against each normalized token. Exact, prefix, substring and (as a
 * last resort) fuzzy matches contribute progressively smaller weights, scaled
 * by token length and by `factor`.
 *
 * @param tokens search tokens (raw, not yet normalized)
 * @param str the text to score against (e.g. the note title or the note path title)
 * @param factor weight multiplier for this string (e.g. TITLE_FACTOR vs. PATH_FACTOR)
 */
addScoreForStrings(tokens: string[], str: string, factor: number) {
    const normalizedStr = normalizeSearchText(str.toLowerCase());
    const chunks = normalizedStr.split(" ");

    // Hoisted out of the chunk loop: token normalization does not depend on
    // the chunk, so computing it once avoids O(chunks × tokens) normalize calls.
    const normalizedTokens = tokens.map((token) => ({
        token,
        normalized: normalizeSearchText(token.toLowerCase())
    }));

    let tokenScore = 0;
    for (const chunk of chunks) {
        for (const { token, normalized } of normalizedTokens) {
            if (chunk === normalized) {
                tokenScore += SCORE_WEIGHTS.TOKEN_EXACT_MATCH * token.length * factor;
            } else if (chunk.startsWith(normalized)) {
                tokenScore += SCORE_WEIGHTS.TOKEN_PREFIX_MATCH * token.length * factor;
            } else if (chunk.includes(normalized)) {
                tokenScore += SCORE_WEIGHTS.TOKEN_CONTAINS_MATCH * token.length * factor;
            } else if (normalized.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH) {
                // Fuzzy fallback for typos. The cheap length eligibility check
                // runs first so short tokens skip the edit distance computation.
                const editDistance = calculateOptimizedEditDistance(chunk, normalized, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);

                if (editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE) {
                    // Weight decays linearly with the edit distance.
                    const fuzzyWeight = SCORE_WEIGHTS.TOKEN_FUZZY_MATCH * (1 - editDistance / FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
                    tokenScore += fuzzyWeight * token.length * factor;
                }
            }
        }
    }

    this.score += tokenScore;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the query matches as a complete word in the text
|
||||||
|
*/
|
||||||
|
private isWordMatch(text: string, query: string): boolean {
|
||||||
|
return text.includes(` ${query} `) ||
|
||||||
|
text.startsWith(`${query} `) ||
|
||||||
|
text.endsWith(` ${query}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates fuzzy matching score for title matches
|
||||||
|
*/
|
||||||
|
private calculateFuzzyTitleScore(title: string, query: string): number {
|
||||||
|
const editDistance = calculateOptimizedEditDistance(title, query, FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE);
|
||||||
|
const maxLen = Math.max(title.length, query.length);
|
||||||
|
|
||||||
|
// Only apply fuzzy matching if the query is reasonably long and edit distance is small
|
||||||
|
if (query.length >= FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH &&
|
||||||
|
editDistance <= FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE &&
|
||||||
|
editDistance / maxLen <= 0.3) {
|
||||||
|
const similarity = 1 - (editDistance / maxLen);
|
||||||
|
return SCORE_WEIGHTS.TITLE_WORD_MATCH * similarity * 0.7; // Reduced weight for fuzzy matches
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export default SearchResult;
|
export default SearchResult;
|
||||||
|
|||||||
271
apps/server/src/services/search/utils/text_utils.ts
Normal file
271
apps/server/src/services/search/utils/text_utils.ts
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
import { normalize } from "../../utils.js";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared text processing utilities for search functionality
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Configuration constants for fuzzy matching
|
||||||
|
export const FUZZY_SEARCH_CONFIG = {
|
||||||
|
// Minimum token length for fuzzy operators to prevent false positives
|
||||||
|
MIN_FUZZY_TOKEN_LENGTH: 3,
|
||||||
|
// Maximum edit distance for fuzzy matching
|
||||||
|
MAX_EDIT_DISTANCE: 2,
|
||||||
|
// Maximum proximity distance for phrase matching (in words)
|
||||||
|
MAX_PHRASE_PROXIMITY: 10,
|
||||||
|
// Content size limits for memory protection
|
||||||
|
MAX_CONTENT_SIZE: 50 * 1024, // 50KB
|
||||||
|
MAX_WORD_COUNT: 10000,
|
||||||
|
// Performance thresholds
|
||||||
|
EARLY_TERMINATION_THRESHOLD: 3,
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizes text by removing diacritics and converting to lowercase.
|
||||||
|
* This is the centralized text normalization function used across all search components.
|
||||||
|
* Uses the shared normalize function from utils for consistency.
|
||||||
|
*
|
||||||
|
* Examples:
|
||||||
|
* - "café" -> "cafe"
|
||||||
|
* - "naïve" -> "naive"
|
||||||
|
* - "HELLO WORLD" -> "hello world"
|
||||||
|
*
|
||||||
|
* @param text The text to normalize
|
||||||
|
* @returns The normalized text
|
||||||
|
*/
|
||||||
|
export function normalizeSearchText(text: string): string {
|
||||||
|
if (!text || typeof text !== 'string') {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use shared normalize function for consistency across the codebase
|
||||||
|
return normalize(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optimized edit distance calculation using single array and early termination.
|
||||||
|
* This is significantly more memory efficient than the 2D matrix approach and includes
|
||||||
|
* early termination optimizations for better performance.
|
||||||
|
*
|
||||||
|
* @param str1 First string
|
||||||
|
* @param str2 Second string
|
||||||
|
* @param maxDistance Maximum allowed distance (for early termination)
|
||||||
|
* @returns The edit distance between the strings, or maxDistance + 1 if exceeded
|
||||||
|
*/
|
||||||
|
export function calculateOptimizedEditDistance(str1: string, str2: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): number {
|
||||||
|
// Input validation
|
||||||
|
if (typeof str1 !== 'string' || typeof str2 !== 'string') {
|
||||||
|
throw new Error('Both arguments must be strings');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxDistance < 0 || !Number.isInteger(maxDistance)) {
|
||||||
|
throw new Error('maxDistance must be a non-negative integer');
|
||||||
|
}
|
||||||
|
|
||||||
|
const len1 = str1.length;
|
||||||
|
const len2 = str2.length;
|
||||||
|
|
||||||
|
// Performance guard: if strings are too long, limit processing
|
||||||
|
const maxStringLength = 1000;
|
||||||
|
if (len1 > maxStringLength || len2 > maxStringLength) {
|
||||||
|
// For very long strings, fall back to simple length-based heuristic
|
||||||
|
return Math.abs(len1 - len2) <= maxDistance ? Math.abs(len1 - len2) : maxDistance + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Early termination: if length difference exceeds max distance
|
||||||
|
if (Math.abs(len1 - len2) > maxDistance) {
|
||||||
|
return maxDistance + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle edge cases
|
||||||
|
if (len1 === 0) return len2 <= maxDistance ? len2 : maxDistance + 1;
|
||||||
|
if (len2 === 0) return len1 <= maxDistance ? len1 : maxDistance + 1;
|
||||||
|
|
||||||
|
// Use single array optimization for better memory usage
|
||||||
|
let previousRow = Array.from({ length: len2 + 1 }, (_, i) => i);
|
||||||
|
let currentRow = new Array(len2 + 1);
|
||||||
|
|
||||||
|
for (let i = 1; i <= len1; i++) {
|
||||||
|
currentRow[0] = i;
|
||||||
|
let minInRow = i;
|
||||||
|
|
||||||
|
for (let j = 1; j <= len2; j++) {
|
||||||
|
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
|
||||||
|
currentRow[j] = Math.min(
|
||||||
|
previousRow[j] + 1, // deletion
|
||||||
|
currentRow[j - 1] + 1, // insertion
|
||||||
|
previousRow[j - 1] + cost // substitution
|
||||||
|
);
|
||||||
|
|
||||||
|
// Track minimum value in current row for early termination
|
||||||
|
if (currentRow[j] < minInRow) {
|
||||||
|
minInRow = currentRow[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Early termination: if minimum distance in row exceeds threshold
|
||||||
|
if (minInRow > maxDistance) {
|
||||||
|
return maxDistance + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Swap arrays for next iteration
|
||||||
|
[previousRow, currentRow] = [currentRow, previousRow];
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = previousRow[len2];
|
||||||
|
return result <= maxDistance ? result : maxDistance + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates that tokens meet minimum requirements for fuzzy operators.
|
||||||
|
*
|
||||||
|
* @param tokens Array of search tokens
|
||||||
|
* @param operator The search operator being used
|
||||||
|
* @returns Validation result with success status and error message
|
||||||
|
*/
|
||||||
|
export function validateFuzzySearchTokens(tokens: string[], operator: string): { isValid: boolean; error?: string } {
|
||||||
|
if (!operator || typeof operator !== 'string') {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: 'Invalid operator: operator must be a non-empty string'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!Array.isArray(tokens)) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: 'Invalid tokens: tokens must be an array'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tokens.length === 0) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: 'Invalid tokens: at least one token is required'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for null, undefined, or non-string tokens
|
||||||
|
const invalidTypeTokens = tokens.filter(token =>
|
||||||
|
token == null || typeof token !== 'string'
|
||||||
|
);
|
||||||
|
|
||||||
|
if (invalidTypeTokens.length > 0) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: 'Invalid tokens: all tokens must be non-null strings'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for empty string tokens
|
||||||
|
const emptyTokens = tokens.filter(token => token.trim().length === 0);
|
||||||
|
|
||||||
|
if (emptyTokens.length > 0) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: 'Invalid tokens: empty or whitespace-only tokens are not allowed'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (operator !== '~=' && operator !== '~*') {
|
||||||
|
return { isValid: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check minimum token length for fuzzy operators
|
||||||
|
const shortTokens = tokens.filter(token => token.length < FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
if (shortTokens.length > 0) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: `Fuzzy search operators (~=, ~*) require tokens of at least ${FUZZY_SEARCH_CONFIG.MIN_FUZZY_TOKEN_LENGTH} characters. Invalid tokens: ${shortTokens.join(', ')}`
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for excessively long tokens that could cause performance issues
|
||||||
|
const maxTokenLength = 100; // Reasonable limit for search tokens
|
||||||
|
const longTokens = tokens.filter(token => token.length > maxTokenLength);
|
||||||
|
|
||||||
|
if (longTokens.length > 0) {
|
||||||
|
return {
|
||||||
|
isValid: false,
|
||||||
|
error: `Tokens are too long (max ${maxTokenLength} characters). Long tokens: ${longTokens.map(t => t.substring(0, 20) + '...').join(', ')}`
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return { isValid: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates and preprocesses content for search operations with size limits.
|
||||||
|
*
|
||||||
|
* @param content The content to validate and preprocess
|
||||||
|
* @param noteId The note ID (for logging purposes)
|
||||||
|
* @returns Processed content or null if content exceeds limits
|
||||||
|
*/
|
||||||
|
export function validateAndPreprocessContent(content: string, noteId?: string): string | null {
|
||||||
|
if (!content || typeof content !== 'string') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check content size limits
|
||||||
|
if (content.length > FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE) {
|
||||||
|
console.warn(`Content size exceeds limit for note ${noteId || 'unknown'}: ${content.length} bytes`);
|
||||||
|
return content.substring(0, FUZZY_SEARCH_CONFIG.MAX_CONTENT_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check word count limits for phrase matching
|
||||||
|
const wordCount = content.split(/\s+/).length;
|
||||||
|
if (wordCount > FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT) {
|
||||||
|
console.warn(`Word count exceeds limit for note ${noteId || 'unknown'}: ${wordCount} words`);
|
||||||
|
// Take first MAX_WORD_COUNT words
|
||||||
|
return content.split(/\s+/).slice(0, FUZZY_SEARCH_CONFIG.MAX_WORD_COUNT).join(' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if a word matches a token with fuzzy matching.
|
||||||
|
* Optimized for common case where distances are small.
|
||||||
|
*
|
||||||
|
* @param token The search token (should be normalized)
|
||||||
|
* @param word The word to match against (should be normalized)
|
||||||
|
* @param maxDistance Maximum allowed edit distance
|
||||||
|
* @returns True if the word matches the token within the distance threshold
|
||||||
|
*/
|
||||||
|
export function fuzzyMatchWord(token: string, word: string, maxDistance: number = FUZZY_SEARCH_CONFIG.MAX_EDIT_DISTANCE): boolean {
|
||||||
|
// Input validation
|
||||||
|
if (typeof token !== 'string' || typeof word !== 'string') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token.length === 0 || word.length === 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Exact match check first (most common case)
|
||||||
|
if (word.includes(token)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Length difference check for early exit
|
||||||
|
if (Math.abs(word.length - token.length) > maxDistance) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For very short tokens or very different lengths, be more strict
|
||||||
|
if (token.length < 4 || Math.abs(word.length - token.length) > 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use optimized edit distance calculation
|
||||||
|
const distance = calculateOptimizedEditDistance(token, word, maxDistance);
|
||||||
|
return distance <= maxDistance;
|
||||||
|
} catch (error) {
|
||||||
|
// Log error and return false for safety
|
||||||
|
console.warn('Error in fuzzy word matching:', error);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user