chore(search): use loop to prevent nested strip tags injection

This commit is contained in:
Elian Doran
2026-04-13 14:03:17 +03:00
parent e40504b7f0
commit f58dd12983
3 changed files with 93 additions and 4 deletions

View File

@@ -8,6 +8,7 @@ import SearchContext from "../search_context.js";
import becca from "../../../becca/becca.js";
import beccaService from "../../../becca/becca_service.js";
import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js";
import { stripHtmlTags } from "../utils/text_utils.js";
import log from "../../log.js";
import hoistedNoteService from "../../hoisted_note.js";
import type BNote from "../../../becca/entities/bnote.js";
@@ -494,10 +495,9 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
return ""; // Protected but no session available
}
// Strip HTML tags for text notes — use fast regex for snippet extraction
// (striptags library is ~18x slower and not needed for search snippets)
// Strip HTML tags for text notes
if (note.type === "text") {
content = content.replace(/<[^>]*>/g, "");
content = stripHtmlTags(content);
}
if (!content) {

View File

@@ -1,5 +1,5 @@
import { describe, it, expect } from "vitest";
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js';
describe('Fuzzy Search Core', () => {
describe('calculateOptimizedEditDistance', () => {
@@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => {
expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
});
});
// Behavioural tests for stripHtmlTags: ordinary markup, malformed/nested
// markup, non-string input, and text that merely contains angle brackets.
describe('stripHtmlTags', () => {
// Well-formed opening/closing pairs are removed; the text between them is kept.
it('strips simple HTML tags', () => {
expect(stripHtmlTags('<p>Hello</p>')).toBe('Hello');
expect(stripHtmlTags('<div><span>World</span></div>')).toBe('World');
expect(stripHtmlTags('<b>Bold</b> and <i>italic</i>')).toBe('Bold and italic');
});
// Self-closing tags are removed outright; nothing (not even whitespace) replaces them.
it('handles self-closing tags', () => {
expect(stripHtmlTags('Line1<br/>Line2')).toBe('Line1Line2');
expect(stripHtmlTags('Image: <img src="x.png"/>')).toBe('Image: ');
});
// Attributes are part of the tag and are dropped together with it.
it('handles tags with attributes', () => {
expect(stripHtmlTags('<a href="url">Link</a>')).toBe('Link');
expect(stripHtmlTags('<div class="foo" id="bar">Content</div>')).toBe('Content');
});
it('handles nested tag patterns securely', () => {
// Property under test: no `<` immediately followed by a letter survives stripping.
// A leftover `>` with no `<` before it is plain text and cannot open a tag.
// A tag match runs from a `<` up to the first following `>`, and may contain further `<`.
// So `<a<b>` is removed as ONE match, leaving `c>text` (not inner-then-outer removal).
const result1 = stripHtmlTags('<a<b>c>text');
expect(result1).not.toMatch(/<[a-z]/i); // No `<` + letter sequence remains
expect(result1).toBe('c>text'); // The residue contains no `<` at all
// Split-keyword input: neither an opening nor a closing script tag may be left behind
const result2 = stripHtmlTags('<scr<script>ipt>alert(1)</script>');
expect(result2).not.toMatch(/<script/i);
expect(result2).not.toMatch(/<\/script/i);
// Doubled opening bracket
const result3 = stripHtmlTags('<<b>script>code');
expect(result3).toBe('script>code'); // `<<b>` is consumed as a single match; `script>code` has no `<` left
expect(result3).not.toMatch(/<[a-z]/i);
});
// "Unclosed" here means a missing closing *tag*; each opening tag is itself complete
// (`<p>`, `<b>`) and is therefore removed.
it('handles unclosed tags', () => {
expect(stripHtmlTags('<p>Unclosed paragraph')).toBe('Unclosed paragraph');
expect(stripHtmlTags('Text with <b>unclosed bold')).toBe('Text with unclosed bold');
});
// Falsy input yields an empty string; the casts bypass the `string` parameter type.
it('handles empty and null input', () => {
expect(stripHtmlTags('')).toBe('');
expect(stripHtmlTags(null as any)).toBe('');
expect(stripHtmlTags(undefined as any)).toBe('');
});
// Input without any `<...>` sequence must come back unchanged.
it('returns plain text unchanged', () => {
expect(stripHtmlTags('Just plain text')).toBe('Just plain text');
expect(stripHtmlTags('No tags here!')).toBe('No tags here!');
});
it('handles angle brackets in text', () => {
// A `>` with no `<` before it is preserved
expect(stripHtmlTags('Text > with > symbols')).toBe('Text > with > symbols');
// Known limitation: `< 10 >` has the shape `<...>` and is therefore removed like a tag.
// Accepted for search snippets, since the output still contains no tag.
// NOTE(review): removing `< 10 >` leaves the spaces on both sides of it, so the result
// should contain two consecutive spaces — confirm the expected literal below has not had
// its whitespace collapsed.
expect(stripHtmlTags('Math: 5 < 10 > 3')).toBe('Math: 5 3');
// Entity-escaped brackets contain no literal `<`/`>` and pass through untouched
expect(stripHtmlTags('5 &lt; 10')).toBe('5 &lt; 10');
});
});
});

View File

@@ -49,6 +49,30 @@ export function normalizeSearchText(text: string): string {
return normalize(text);
}
/**
 * Removes HTML tags from a string so the remainder can be used as a plain-text snippet.
 *
 * Every `<...>` sequence (a `<`, any run of characters other than `>`, then a `>`) is
 * deleted, and the deletion is repeated until a pass changes nothing. This ensures that
 * input such as `<scr<script>ipt>` cannot leave a complete tag behind.
 *
 * A `<` that has no `>` anywhere after it, and a `>` that has no `<` before it, are not
 * tags by this definition and are left in place.
 *
 * @param html The HTML content to strip
 * @returns The input with all `<...>` sequences removed, or `""` when the input is
 *          empty or not a string
 */
export function stripHtmlTags(html: string): string {
    // Callers outside the type system may pass null/undefined/non-strings.
    if (typeof html !== "string" || html.length === 0) {
        return "";
    }

    const tagPattern = /<[^>]*>/g;
    let stripped = html;

    // Keep stripping until a fixed point is reached.
    for (;;) {
        const next = stripped.replace(tagPattern, "");
        if (next === stripped) {
            return stripped;
        }
        stripped = next;
    }
}
/**
* Optimized edit distance calculation using single array and early termination.
* This is significantly more memory efficient than the 2D matrix approach and includes