mirror of
https://github.com/zadam/trilium.git
synced 2025-11-08 14:25:51 +01:00
do a better job of extracting context
This commit is contained in:
@@ -5,6 +5,7 @@ import options from "../options.js";
|
||||
import log from "../log.js";
|
||||
import type { Message } from "./ai_interface.js";
|
||||
import { cosineSimilarity } from "./embeddings/vector_store.js";
|
||||
import sanitizeHtml from "sanitize-html";
|
||||
|
||||
/**
|
||||
* TriliumContextService provides intelligent context management for working with large knowledge bases
|
||||
@@ -351,15 +352,16 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
|
||||
context += `--- NOTE ${index + 1}: ${source.title} ---\n`;
|
||||
|
||||
if (source.content) {
|
||||
// Clean up HTML content before adding it to the context
|
||||
let cleanContent = this.sanitizeNoteContent(source.content, source.type, source.mime);
|
||||
|
||||
// Truncate content if it's too long
|
||||
const maxContentLength = 1000;
|
||||
let content = source.content;
|
||||
|
||||
if (content.length > maxContentLength) {
|
||||
content = content.substring(0, maxContentLength) + " [content truncated due to length]";
|
||||
if (cleanContent.length > maxContentLength) {
|
||||
cleanContent = cleanContent.substring(0, maxContentLength) + " [content truncated due to length]";
|
||||
}
|
||||
|
||||
context += `${content}\n`;
|
||||
context += `${cleanContent}\n`;
|
||||
} else {
|
||||
context += "[This note doesn't contain textual content]\n";
|
||||
}
|
||||
@@ -373,6 +375,45 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
|
||||
return context;
|
||||
}
|
||||
|
||||
/**
 * Sanitize note content for use in context: strip HTML markup, decode the
 * common HTML entities back to plain characters and collapse whitespace.
 *
 * Content is treated as HTML when the note is declared as `text` /
 * `text/html`, or when it contains one of a few tell-tale tags
 * (`<div`, `<p>`, `<span`). Anything else only gets whitespace normalization.
 *
 * @param content - raw note content
 * @param type - note type, used only for the HTML heuristic
 * @param mime - note MIME type, used only for the HTML heuristic
 * @returns single-line plain text; an empty string when `content` is empty
 */
private sanitizeNoteContent(content: string, type?: string, mime?: string): string {
    if (!content) return '';

    // If it's likely HTML content
    const looksLikeHtml =
        (type === 'text' && mime === 'text/html') ||
        content.includes('<div') ||
        content.includes('<p>') ||
        content.includes('<span');

    if (looksLikeHtml) {
        // Use sanitizeHtml with an empty allow-list to remove all HTML tags
        content = sanitizeHtml(content, {
            allowedTags: [],
            allowedAttributes: {},
            // Collapse runs of blank lines into a single blank line
            textFilter: (text: string) => text.replace(/\n\s*\n/g, '\n\n')
        });

        // Decode the HTML entities left in the sanitized text.
        // `&amp;` must be decoded LAST: decoding it earlier would turn
        // text such as `&amp;quot;` into `&quot;` and then into `"`
        // (double decoding).
        content = content
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/&amp;/g, '&');
    }

    // Normalize whitespace: every whitespace run (including newlines) becomes one space
    content = content.replace(/\s+/g, ' ').trim();

    return content;
}
|
||||
|
||||
/**
|
||||
* Process a user query with the Trilium-specific approach:
|
||||
* 1. Generate search queries from the original question
|
||||
|
||||
Reference in New Issue
Block a user