Keep whole lines for code highlighting in search (#1871)

We now use SimpleSpanFragmenter with a fragment size of 200 for each highlighted field. For code fields we ensure that each line is complete.
2026-07-04 10:59:11 +02:00 · 2021-11-25 13:48:36 +01:00
parent f0ab0950a2
commit 1082d39120
7 changed files with 427 additions and 15 deletions
--- a/scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java
+++ b/scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java
@@ -0,0 +1,87 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2020-present Cloudogu GmbH and Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package sonia.scm.search;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.*;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Creates highlighted fragments for the value of an indexed field.
+ *
+ * <p>Matches of the query are wrapped in {@code <>} and {@code </>}. For fields analyzed as
+ * {@link Indexed.Analyzer#CODE} each fragment is extended so that it only consists of
+ * complete lines of the original value.</p>
+ */
+public final class LuceneHighlighter {
+
+  // Internal markers put around matches by the formatter. They are stripped to locate a
+  // fragment within the original content and translated to "<>" / "</>" before returning.
+  private static final String PRE_TAG = "<|[[--";
+  private static final String POST_TAG = "--]]|>";
+
+  private static final int MAX_NUM_FRAGMENTS = 5;
+  private static final int FRAGMENT_SIZE = 200;
+
+  private final Analyzer analyzer;
+  private final Highlighter highlighter;
+
+  /**
+   * Creates a highlighter for the given query.
+   *
+   * @param analyzer analyzer used to tokenize the field values which should be highlighted
+   * @param query query whose matches are highlighted
+   */
+  public LuceneHighlighter(Analyzer analyzer, Query query) {
+    this.analyzer = analyzer;
+    QueryScorer scorer = new QueryScorer(query);
+    this.highlighter = new Highlighter(new SimpleHTMLFormatter(PRE_TAG, POST_TAG), scorer);
+    this.highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, FRAGMENT_SIZE));
+  }
+
+  /**
+   * Returns up to {@value #MAX_NUM_FRAGMENTS} highlighted fragments of the given value.
+   *
+   * @param fieldName name of the field the value belongs to
+   * @param fieldAnalyzer analyzer type of the field, code fields are extended to whole lines
+   * @param value content of the field
+   * @return fragments with matches wrapped in {@code <>} and {@code </>}
+   * @throws InvalidTokenOffsetsException thrown by the underlying lucene highlighter
+   * @throws IOException thrown by the underlying lucene highlighter
+   */
+  public String[] highlight(String fieldName, Indexed.Analyzer fieldAnalyzer, String value) throws InvalidTokenOffsetsException, IOException {
+    String[] fragments = highlighter.getBestFragments(analyzer, fieldName, value, MAX_NUM_FRAGMENTS);
+    if (fieldAnalyzer == Indexed.Analyzer.CODE) {
+      fragments = keepWholeLine(value, fragments);
+    }
+    return Arrays.stream(fragments)
+      .map(fragment -> fragment.replace(PRE_TAG, "<>").replace(POST_TAG, "</>"))
+      .toArray(String[]::new);
+  }
+
+  /**
+   * Extends every fragment to whole lines of the content.
+   */
+  private String[] keepWholeLine(String content, String[] fragments) {
+    return Arrays.stream(fragments)
+      .map(fragment -> keepWholeLine(content, fragment))
+      .toArray(String[]::new);
+  }
+
+  /**
+   * Extends a single fragment so that it starts at the beginning of its first line and
+   * ends at the end of its last line. The returned snippet is terminated with a line break.
+   *
+   * @param content complete value of the field
+   * @param fragment highlighted fragment, still containing the internal markers
+   * @return fragment extended to whole lines, or the unchanged fragment if it could not be
+   *         located within the content
+   */
+  private String keepWholeLine(String content, String fragment) {
+    String raw = fragment.replace(PRE_TAG, "").replace(POST_TAG, "");
+    int index = content.indexOf(raw);
+    if (index < 0) {
+      // The fragment could not be located (e.g. the content itself contains the marker
+      // strings). Return it as is, instead of failing with an index out of bounds.
+      return fragment;
+    }
+
+    int lineBreak = content.lastIndexOf('\n', index);
+    String snippet;
+    if (lineBreak == index) {
+      // the fragment itself starts with the line break, which is not part of the line
+      snippet = fragment.startsWith("\n") ? fragment.substring(1) : fragment;
+    } else {
+      // lineBreak is -1 if the fragment is on the first line, so lineBreak + 1 is the
+      // first character of the line in both cases and the line break itself is skipped
+      snippet = content.substring(lineBreak + 1, index) + fragment;
+    }
+
+    int fragmentEnd = index + raw.length();
+    int end = content.indexOf('\n', fragmentEnd);
+    if (end < 0) {
+      // fragment is on the last line, which has no trailing line break
+      end = content.length();
+    }
+
+    return snippet + content.substring(fragmentEnd, end) + "\n";
+  }
+
+}
--- a/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java
+++ b/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java
@@ -30,10 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
-import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -47,23 +44,14 @@ import static java.util.Optional.of;

 public class QueryResultFactory {

-  private final Analyzer analyzer;
-  private final Highlighter highlighter;
+  private final LuceneHighlighter highlighter;
  private final IndexSearcher searcher;
  private final LuceneSearchableType searchableType;

  public QueryResultFactory(Analyzer analyzer, IndexSearcher searcher, LuceneSearchableType searchableType, Query query) {
-    this.analyzer = analyzer;
    this.searcher = searcher;
    this.searchableType = searchableType;
-    this.highlighter = createHighlighter(query);
-  }
-
-  private Highlighter createHighlighter(Query query) {
-    return new Highlighter(
-      new SimpleHTMLFormatter("<>", "</>"),
-      new QueryScorer(query)
-    );
+    this.highlighter = new LuceneHighlighter(analyzer, query);
  }

  public QueryResult create(TopDocs topDocs) throws IOException, InvalidTokenOffsetsException {
@@ -98,7 +86,7 @@ public class QueryResultFactory {
  }

  private String[] createFragments(LuceneSearchableField field, String value) throws InvalidTokenOffsetsException, IOException {
-    return highlighter.getBestFragments(analyzer, field.getName(), value, 5);
+    return highlighter.highlight(field.getName(), field.getAnalyzer(), value);
  }

 }