chore(search): use loop to prevent nested strip tags injection

2026-05-07 08:25:47 +02:00 · 2026-04-13 14:03:17 +03:00
parent e40504b7f0
commit f58dd12983
3 changed files with 93 additions and 4 deletions
--- a/apps/server/src/services/search/services/search.ts
+++ b/apps/server/src/services/search/services/search.ts
@@ -8,6 +8,7 @@ import SearchContext from "../search_context.js";
 import becca from "../../../becca/becca.js";
 import beccaService from "../../../becca/becca_service.js";
 import { normalize, removeDiacritic, escapeHtml, escapeRegExp } from "../../utils.js";
+import { stripHtmlTags } from "../utils/text_utils.js";
 import log from "../../log.js";
 import hoistedNoteService from "../../hoisted_note.js";
 import type BNote from "../../../becca/entities/bnote.js";
@@ -494,10 +495,9 @@ function extractContentSnippet(noteId: string, searchTokens: string[], maxLength
            return ""; // Protected but no session available
        }

-        // Strip HTML tags for text notes — use fast regex for snippet extraction
-        // (striptags library is ~18x slower and not needed for search snippets)
+        // Strip HTML tags for text notes
        if (note.type === "text") {
-            content = content.replace(/<[^>]*>/g, "");
+            content = stripHtmlTags(content);
        }

        if (!content) {
--- a/apps/server/src/services/search/utils/text_utils.spec.ts
+++ b/apps/server/src/services/search/utils/text_utils.spec.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect } from "vitest";
-import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord } from './text_utils.js';
+import { calculateOptimizedEditDistance, validateFuzzySearchTokens, fuzzyMatchWord, stripHtmlTags } from './text_utils.js';

 describe('Fuzzy Search Core', () => {
    describe('calculateOptimizedEditDistance', () => {
@@ -62,4 +62,69 @@ describe('Fuzzy Search Core', () => {
            expect(fuzzyMatchWord('a', 'b')).toBe(false); // Very short tokens
        });
    });
+
+    describe('stripHtmlTags', () => {
+        it('strips simple HTML tags', () => {
+            expect(stripHtmlTags('<p>Hello</p>')).toBe('Hello');
+            expect(stripHtmlTags('<div><span>World</span></div>')).toBe('World');
+            expect(stripHtmlTags('<b>Bold</b> and <i>italic</i>')).toBe('Bold and italic');
+        });
+
+        it('handles self-closing tags', () => {
+            expect(stripHtmlTags('Line1<br/>Line2')).toBe('Line1Line2');
+            expect(stripHtmlTags('Image: <img src="x.png"/>')).toBe('Image: ');
+        });
+
+        it('handles tags with attributes', () => {
+            expect(stripHtmlTags('<a href="url">Link</a>')).toBe('Link');
+            expect(stripHtmlTags('<div class="foo" id="bar">Content</div>')).toBe('Content');
+        });
+
+        it('handles nested tag patterns securely', () => {
+            // Security property: no complete <tag> patterns remain after stripping
+            // Residual `>` chars are harmless for XSS
+
+            // Nested tags: `[^>]*` also consumes `<`, so one match runs from the first `<` to the first `>`
+            // <a<b>c>text → `<a<b>` removed as a single match → 'c>text' (the residual `c>` is plain text)
+            const result1 = stripHtmlTags('<a<b>c>text');
+            expect(result1).not.toMatch(/<[a-z]/i); // No opening tags remain
+            expect(result1).toBe('c>text'); // Residual text is safe
+
+            // Complex nesting leaves no exploitable patterns
+            const result2 = stripHtmlTags('<scr<script>ipt>alert(1)</script>');
+            expect(result2).not.toMatch(/<script/i);
+            expect(result2).not.toMatch(/<\/script/i);
+
+            // Double-nested removal
+            const result3 = stripHtmlTags('<<b>script>code');
+            expect(result3).toBe('script>code'); // `<<b>` is removed as a single match, so no `<` is left behind
+            expect(result3).not.toMatch(/<[a-z]/i);
+        });
+
+        it('handles unclosed tags', () => {
+            expect(stripHtmlTags('<p>Unclosed paragraph')).toBe('Unclosed paragraph');
+            expect(stripHtmlTags('Text with <b>unclosed bold')).toBe('Text with unclosed bold');
+        });
+
+        it('handles empty and null input', () => {
+            expect(stripHtmlTags('')).toBe('');
+            expect(stripHtmlTags(null as any)).toBe('');
+            expect(stripHtmlTags(undefined as any)).toBe('');
+        });
+
+        it('returns plain text unchanged', () => {
+            expect(stripHtmlTags('Just plain text')).toBe('Just plain text');
+            expect(stripHtmlTags('No tags here!')).toBe('No tags here!');
+        });
+
+        it('handles angle brackets in text', () => {
+            // Standalone > without matching < is preserved
+            expect(stripHtmlTags('Text > with > symbols')).toBe('Text > with > symbols');
+            // Note: `< 10 >` looks like a tag to the regex - this is a known limitation
+            // For search snippets, this is acceptable as it's still safe (no XSS)
+            expect(stripHtmlTags('Math: 5 < 10 > 3')).toBe('Math: 5  3');
+            // But properly escaped content works
+            expect(stripHtmlTags('5 &lt; 10')).toBe('5 &lt; 10');
+        });
+    });
 });
--- a/apps/server/src/services/search/utils/text_utils.ts
+++ b/apps/server/src/services/search/utils/text_utils.ts
@@ -49,6 +49,30 @@ export function normalizeSearchText(text: string): string {
    return normalize(text);
 }

+/**
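+ * Strips HTML tags from content for snippet extraction.
+ * Repeats the replacement until the text stops changing, so no complete `<...>` span survives nested/malformed input like `<scr<script>ipt>`.
+ *
+ * @param html The HTML content to strip
+ * @returns Text with every `<...>` span removed (stray `<` or `>` characters may remain); `""` for empty or non-string input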
+ */
+export function stripHtmlTags(html: string): string {
+    if (!html || typeof html !== "string") {
+        return "";
+    }
+
+    let result = html;
+    let previous: string;
+
+    // Loop until no more tags — handles nested cases like <scr<script>ipt>
+    do {
+        previous = result;
+        result = result.replace(/<[^>]*>/g, "");
+    } while (result !== previous);
+
+    return result;
+}
+
 /**
 * Optimized edit distance calculation using single array and early termination.
 * This is significantly more memory efficient than the 2D matrix approach and includes