mirror of
https://github.com/zadam/trilium.git
synced 2026-05-07 08:25:47 +02:00
chore(search): use loop to prevent nested strip tags injection
This commit is contained in:
@@ -8,6 +8,7 @@ import SearchContext from "../search_context.js";
|
||||
import becca from "../../../becca/becca.js";
|
||||
import beccaService from "../../../becca/becca_service.js";
|
||||
import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js";
|
||||
import { stripHtmlTags } from "../utils/text_utils.js";
|
||||
import log from "../../log.js";
|
||||
import hoistedNoteService from "../../hoisted_note.js";
|
||||
import type BNote from "../../../becca/entities/bnote.js";
|
||||
@@ -494,10 +495,9 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
|
||||
return ""; // Protected but no session available
|
||||
}
|
||||
|
||||
// Strip HTML tags for text notes — use fast regex for snippet extraction
|
||||
// (striptags library is ~18x slower and not needed for search snippets)
|
||||
// Strip HTML tags for text notes
|
||||
if (note.type === "text") {
|
||||
content = content.replace(/<[^>]*>/g, "");
|
||||
content = stripHtmlTags(content);
|
||||
}
|
||||
|
||||
if (!content) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
|
||||
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js';
|
||||
|
||||
describe('Fuzzy Search Core', () => {
|
||||
describe('calculateOptimizedEditDistance', () => {
|
||||
@@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => {
|
||||
expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Behavioural tests for `stripHtmlTags`.
 *
 * The function removes every `<...>` span, where a span runs from a `<` to the
 * first following `>`. The cases below pin both the ordinary behaviour and the
 * edge cases (malformed/nested openers, stray angle brackets, empty input).
 */
describe('stripHtmlTags', () => {
    it('strips simple HTML tags', () => {
        expect(stripHtmlTags('<p>Hello</p>')).toBe('Hello');
        expect(stripHtmlTags('<div><span>World</span></div>')).toBe('World');
        expect(stripHtmlTags('<b>Bold</b> and <i>italic</i>')).toBe('Bold and italic');
    });

    it('handles self-closing tags', () => {
        expect(stripHtmlTags('Line1<br/>Line2')).toBe('Line1Line2');
        expect(stripHtmlTags('Image: <img src="x.png"/>')).toBe('Image: ');
    });

    it('handles tags with attributes', () => {
        expect(stripHtmlTags('<a href="url">Link</a>')).toBe('Link');
        expect(stripHtmlTags('<div class="foo" id="bar">Content</div>')).toBe('Content');
    });

    it('handles nested tag patterns securely', () => {
        // Security property: no complete <tag> patterns remain after stripping.
        // Residual `>` chars are plain text and harmless for XSS.

        // The tag pattern runs from a `<` to the FIRST following `>`, and a nested
        // `<` is allowed inside it, so `<a<b>` is consumed as a single match:
        // <a<b>c>text → c>text
        const result1 = stripHtmlTags('<a<b>c>text');
        expect(result1).not.toMatch(/<[a-z]/i); // No opening tags remain
        expect(result1).toBe('c>text'); // Residual text is safe

        // Split-tag injection attempt: `<scr<script>` and `</script>` are both
        // removed, leaving only inert text.
        const result2 = stripHtmlTags('<scr<script>ipt>alert(1)</script>');
        expect(result2).not.toMatch(/<script/i);
        expect(result2).not.toMatch(/<\/script/i);
        expect(result2).toBe('ipt>alert(1)');

        // Doubled opener: `<<b>` is consumed as one match.
        const result3 = stripHtmlTags('<<b>script>code');
        expect(result3).toBe('script>code');
        expect(result3).not.toMatch(/<[a-z]/i);
    });

    it('handles unclosed tags', () => {
        expect(stripHtmlTags('<p>Unclosed paragraph')).toBe('Unclosed paragraph');
        expect(stripHtmlTags('Text with <b>unclosed bold')).toBe('Text with unclosed bold');
    });

    it('handles empty and null input', () => {
        expect(stripHtmlTags('')).toBe('');
        expect(stripHtmlTags(null as any)).toBe('');
        expect(stripHtmlTags(undefined as any)).toBe('');
    });

    it('returns plain text unchanged', () => {
        expect(stripHtmlTags('Just plain text')).toBe('Just plain text');
        expect(stripHtmlTags('No tags here!')).toBe('No tags here!');
    });

    it('handles angle brackets in text', () => {
        // Standalone > without matching < is preserved
        expect(stripHtmlTags('Text > with > symbols')).toBe('Text > with > symbols');
        // Note: `< 10 >` looks like a tag to the regex - this is a known limitation.
        // For search snippets, this is acceptable as it's still safe (no XSS).
        // Only the `< 10 >` span is removed, so the spaces on both sides remain.
        expect(stripHtmlTags('Math: 5 < 10 > 3')).toBe('Math: 5  3');
        // A lone `<` with no `>` after it is not a tag and is preserved
        expect(stripHtmlTags('5 < 10')).toBe('5 < 10');
    });
});
|
||||
});
|
||||
@@ -49,6 +49,30 @@ export function normalizeSearchText(text: string): string {
|
||||
return normalize(text);
|
||||
}
|
||||
|
||||
/**
 * Removes HTML tags from a string so the remainder can be used as a plain-text
 * search snippet.
 *
 * Every `<...>` span (a `<` up to the first following `>`) is deleted. The
 * deletion is repeated until a pass leaves the text unchanged, so malformed
 * input such as `<scr<script>ipt>` cannot leave a complete tag behind.
 * Characters that never form a `<...>` span (a lone `<` or `>`) are kept.
 *
 * @param html The HTML content to strip
 * @returns The text with all tags removed, or `""` when the input is empty or
 *          not a string
 */
export function stripHtmlTags(html: string): string {
    // Defensive runtime guard: callers may pass null/undefined despite the type.
    if (typeof html !== "string" || html.length === 0) {
        return "";
    }

    const tagPattern = /<[^>]*>/g;
    let current = html;

    // Keep stripping until a pass removes nothing (fixed point).
    for (;;) {
        const stripped = current.replace(tagPattern, "");
        if (stripped === current) {
            return stripped;
        }
        current = stripped;
    }
}
|
||||
|
||||
/**
|
||||
* Optimized edit distance calculation using single array and early termination.
|
||||
* This is significantly more memory efficient than the 2D matrix approach and includes
|
||||
|
||||
Reference in New Issue
Block a user