mirror of
https://github.com/scm-manager/scm-manager.git
synced 2026-05-06 10:06:21 +02:00
Keep whole lines for code highlighting in search (#1871)
We now use SimpleSpanFragmenter with a fragment size of 200 for each highlighted field. For code fields we ensure that each line is complete.
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020-present Cloudogu GmbH and Contributors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
package sonia.scm.search;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.highlight.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class LuceneHighlighter {
|
||||
|
||||
private static final String PRE_TAG = "<|[[--";
|
||||
private static final String POST_TAG = "--]]|>";
|
||||
|
||||
private static final int MAX_NUM_FRAGMENTS = 5;
|
||||
private static final int FRAGMENT_SIZE = 200;
|
||||
|
||||
private final Analyzer analyzer;
|
||||
private final Highlighter highlighter;
|
||||
|
||||
public LuceneHighlighter(Analyzer analyzer, Query query) {
|
||||
this.analyzer = analyzer;
|
||||
QueryScorer scorer = new QueryScorer(query);
|
||||
this.highlighter = new Highlighter(new SimpleHTMLFormatter(PRE_TAG, POST_TAG), scorer);
|
||||
this.highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, FRAGMENT_SIZE));
|
||||
}
|
||||
|
||||
public String[] highlight(String fieldName, Indexed.Analyzer fieldAnalyzer, String value) throws InvalidTokenOffsetsException, IOException {
|
||||
String[] fragments = highlighter.getBestFragments(analyzer, fieldName, value, MAX_NUM_FRAGMENTS);
|
||||
if (fieldAnalyzer == Indexed.Analyzer.CODE) {
|
||||
fragments = keepWholeLine(value, fragments);
|
||||
}
|
||||
return Arrays.stream(fragments)
|
||||
.map(fragment -> fragment.replace(PRE_TAG, "<>").replace(POST_TAG, "</>"))
|
||||
.toArray(String[]::new);
|
||||
}
|
||||
|
||||
private String[] keepWholeLine(String content, String[] fragments) {
|
||||
return Arrays.stream(fragments)
|
||||
.map(fragment -> keepWholeLine(content, fragment))
|
||||
.toArray(String[]::new);
|
||||
}
|
||||
|
||||
private String keepWholeLine(String content, String fragment) {
|
||||
String raw = fragment.replace(PRE_TAG, "").replace(POST_TAG, "");
|
||||
int index = content.indexOf(raw);
|
||||
|
||||
int start = content.lastIndexOf('\n', index);
|
||||
if (start < 0) {
|
||||
start = 0;
|
||||
}
|
||||
|
||||
String snippet = content.substring(start, index) + fragment;
|
||||
|
||||
int end = content.indexOf('\n', index + raw.length());
|
||||
if (end < 0) {
|
||||
end = content.length();
|
||||
}
|
||||
|
||||
return snippet + content.substring(index + raw.length(), end) + "\n";
|
||||
}
|
||||
|
||||
}
|
||||
@@ -30,10 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
@@ -47,23 +44,14 @@ import static java.util.Optional.of;
|
||||
|
||||
public class QueryResultFactory {
|
||||
|
||||
private final Analyzer analyzer;
|
||||
private final Highlighter highlighter;
|
||||
private final LuceneHighlighter highlighter;
|
||||
private final IndexSearcher searcher;
|
||||
private final LuceneSearchableType searchableType;
|
||||
|
||||
public QueryResultFactory(Analyzer analyzer, IndexSearcher searcher, LuceneSearchableType searchableType, Query query) {
|
||||
this.analyzer = analyzer;
|
||||
this.searcher = searcher;
|
||||
this.searchableType = searchableType;
|
||||
this.highlighter = createHighlighter(query);
|
||||
}
|
||||
|
||||
private Highlighter createHighlighter(Query query) {
|
||||
return new Highlighter(
|
||||
new SimpleHTMLFormatter("<>", "</>"),
|
||||
new QueryScorer(query)
|
||||
);
|
||||
this.highlighter = new LuceneHighlighter(analyzer, query);
|
||||
}
|
||||
|
||||
public QueryResult create(TopDocs topDocs) throws IOException, InvalidTokenOffsetsException {
|
||||
@@ -98,7 +86,7 @@ public class QueryResultFactory {
|
||||
}
|
||||
|
||||
private String[] createFragments(LuceneSearchableField field, String value) throws InvalidTokenOffsetsException, IOException {
|
||||
return highlighter.getBestFragments(analyzer, field.getName(), value, 5);
|
||||
return highlighter.highlight(field.getName(), field.getAnalyzer(), value);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user