From 1082d391200dec1807b407245d18cb40ecb8d928 Mon Sep 17 00:00:00 2001 From: Sebastian Sdorra Date: Thu, 25 Nov 2021 13:48:36 +0100 Subject: [PATCH] Keep whole lines for code highlighting in search (#1871) We now use SimpleSpanFragmenter with a fragment size of 200 for each highlighted field. For code fields we ensure that each line is complete. --- gradle/changelog/search_highlighter.yaml | 2 + .../sonia/scm/search/LuceneHighlighter.java | 87 +++++++++++++ .../sonia/scm/search/QueryResultFactory.java | 18 +-- .../scm/search/LuceneHighlighterTest.java | 118 +++++++++++++++++ .../resources/sonia/scm/search/Button.tsx.txt | 120 ++++++++++++++++++ .../sonia/scm/search/GameOfLife.java.txt | 91 +++++++++++++ .../resources/sonia/scm/search/content.txt | 6 + 7 files changed, 427 insertions(+), 15 deletions(-) create mode 100644 gradle/changelog/search_highlighter.yaml create mode 100644 scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java create mode 100644 scm-webapp/src/test/java/sonia/scm/search/LuceneHighlighterTest.java create mode 100644 scm-webapp/src/test/resources/sonia/scm/search/Button.tsx.txt create mode 100644 scm-webapp/src/test/resources/sonia/scm/search/GameOfLife.java.txt create mode 100644 scm-webapp/src/test/resources/sonia/scm/search/content.txt diff --git a/gradle/changelog/search_highlighter.yaml b/gradle/changelog/search_highlighter.yaml new file mode 100644 index 0000000000..2f1d4906a0 --- /dev/null +++ b/gradle/changelog/search_highlighter.yaml @@ -0,0 +1,2 @@ +- type: changed + description: Keep whole lines for code highlighting in search ([#1871](https://github.com/scm-manager/scm-manager/pull/1871)) diff --git a/scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java b/scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java new file mode 100644 index 0000000000..53772f23dd --- /dev/null +++ b/scm-webapp/src/main/java/sonia/scm/search/LuceneHighlighter.java @@ -0,0 +1,87 @@ +/* + * MIT License + * + * Copyright (c) 
2020-present Cloudogu GmbH and Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+package sonia.scm.search;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Creates highlighted fragments for the value of a search field.
+ *
+ * The fragments are computed by a Lucene {@link Highlighter} which uses a
+ * {@link SimpleSpanFragmenter} with a fragment size of 200. Fragments of fields
+ * which are analyzed as {@link Indexed.Analyzer#CODE} are extended to whole lines.
+ */
+public final class LuceneHighlighter {
+
+  // Internal markers which the formatter puts around a matched term.
+  // They are replaced in highlight(...) before the fragments are returned.
+  private static final String PRE_TAG = "<|[[--";
+  private static final String POST_TAG = "--]]|>";
+
+  private static final int MAX_NUM_FRAGMENTS = 5;
+  private static final int FRAGMENT_SIZE = 200;
+
+  private final Analyzer analyzer;
+  private final Highlighter highlighter;
+
+  /**
+   * Creates a highlighter for the matches of the given query.
+   *
+   * @param analyzer analyzer which is used to tokenize the field values
+   * @param query query whose matches are highlighted
+   */
+  public LuceneHighlighter(Analyzer analyzer, Query query) {
+    this.analyzer = analyzer;
+    QueryScorer scorer = new QueryScorer(query);
+    this.highlighter = new Highlighter(new SimpleHTMLFormatter(PRE_TAG, POST_TAG), scorer);
+    this.highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, FRAGMENT_SIZE));
+  }
+
+  /**
+   * Returns at most five highlighted fragments of the given field value.
+   *
+   * @param fieldName name of the field the value belongs to
+   * @param fieldAnalyzer analyzer type of the field, fragments of
+   *                      {@link Indexed.Analyzer#CODE} fields are extended to whole lines
+   * @param value complete value of the field
+   * @return highlighted fragments, possibly empty
+   * @throws InvalidTokenOffsetsException if the underlying highlighter fails on the token offsets
+   * @throws IOException if the value could not be analyzed
+   */
+  public String[] highlight(String fieldName, Indexed.Analyzer fieldAnalyzer, String value) throws InvalidTokenOffsetsException, IOException {
+    String[] fragments = highlighter.getBestFragments(analyzer, fieldName, value, MAX_NUM_FRAGMENTS);
+    boolean wholeLines = fieldAnalyzer == Indexed.Analyzer.CODE;
+    return Arrays.stream(fragments)
+      .map(fragment -> wholeLines ? keepWholeLine(value, fragment) : fragment)
+      // NOTE(review): the closing marker is replaced with an empty string, so a match is
+      // only marked at its start - confirm that this is what the client expects.
+      .map(fragment -> fragment.replace(PRE_TAG, "<>").replace(POST_TAG, ""))
+      .toArray(String[]::new);
+  }
+
+  /**
+   * Extends the fragment so that it starts at the beginning of a line of the content
+   * and ends at the end of a line. The internal markers of the fragment are kept.
+   *
+   * The fragment is located by searching for its first occurrence in the content.
+   * If the same text occurs more than once, the surrounding line parts are taken
+   * from the first occurrence.
+   *
+   * @param content complete content the fragment was taken from
+   * @param fragment highlighted fragment including the internal markers
+   * @return fragment extended to whole lines and terminated by a line break,
+   *         or the unchanged fragment if it could not be found in the content
+   */
+  private String keepWholeLine(String content, String fragment) {
+    String raw = fragment.replace(PRE_TAG, "").replace(POST_TAG, "");
+    int index = content.indexOf(raw);
+    if (index < 0) {
+      // The fragment could not be located, e.g. because the content itself contains one
+      // of the marker strings. Return it unchanged instead of failing with an
+      // StringIndexOutOfBoundsException.
+      return fragment;
+    }
+
+    String snippet;
+    if (fragment.startsWith("\n")) {
+      // the fragment begins at a line boundary, only the leading line break has to go
+      snippet = fragment.substring(1);
+    } else {
+      // lastIndexOf returns the position of the line break itself (or -1 if there is none),
+      // the line starts one character behind it
+      int lineStart = content.lastIndexOf('\n', index - 1) + 1;
+      snippet = content.substring(lineStart, index) + fragment;
+    }
+
+    int fragmentEnd = index + raw.length();
+    int lineEnd = content.indexOf('\n', fragmentEnd);
+    if (lineEnd < 0) {
+      // the fragment is part of the last line
+      lineEnd = content.length();
+    }
+
+    return snippet + content.substring(fragmentEnd, lineEnd) + "\n";
+  }
+
+}
diff --git a/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java b/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java
index 460eda6a30..68ee7b2717 100644
--- a/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java
+++ b/scm-webapp/src/main/java/sonia/scm/search/QueryResultFactory.java
@@ -30,10 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
-import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -47,23 +44,14 @@ import static java.util.Optional.of;
 
 public class QueryResultFactory {
 
-  private final Analyzer analyzer;
-  private final Highlighter highlighter;
+  private final LuceneHighlighter highlighter;
   private final IndexSearcher searcher;
   private final LuceneSearchableType searchableType;
 
   public QueryResultFactory(Analyzer analyzer, IndexSearcher searcher, LuceneSearchableType searchableType, Query query) {
-    this.analyzer = analyzer;
     this.searcher = searcher;
     this.searchableType = searchableType;
-    this.highlighter = createHighlighter(query);
-  }
-
-  private Highlighter createHighlighter(Query query) {
-    return new Highlighter(
-      new SimpleHTMLFormatter("<>", ""),
-      new QueryScorer(query)
-    );
+    this.highlighter = new LuceneHighlighter(analyzer, query);
   }
 
   public QueryResult create(TopDocs topDocs) throws IOException, InvalidTokenOffsetsException {
@@ -98,7 +86,7 @@ public class QueryResultFactory {
   }
 
   private String[] createFragments(LuceneSearchableField field, String value) throws InvalidTokenOffsetsException, IOException {
-    return highlighter.getBestFragments(analyzer,
field.getName(), value, 5); + return highlighter.highlight(field.getName(), field.getAnalyzer(), value); } } diff --git a/scm-webapp/src/test/java/sonia/scm/search/LuceneHighlighterTest.java b/scm-webapp/src/test/java/sonia/scm/search/LuceneHighlighterTest.java new file mode 100644 index 0000000000..7cad613fbc --- /dev/null +++ b/scm-webapp/src/test/java/sonia/scm/search/LuceneHighlighterTest.java @@ -0,0 +1,118 @@ +/* + * MIT License + * + * Copyright (c) 2020-present Cloudogu GmbH and Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package sonia.scm.search; + +import com.google.common.io.Resources; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URL; +import java.nio.charset.StandardCharsets; + +import static org.assertj.core.api.Assertions.assertThat; + +class LuceneHighlighterTest { + + @Test + void shouldHighlightText() throws InvalidTokenOffsetsException, IOException { + StandardAnalyzer analyzer = new StandardAnalyzer(); + + Query query = new TermQuery(new Term("content", "golgafrinchan")); + + String content = content("content"); + + LuceneHighlighter highlighter = new LuceneHighlighter(analyzer, query); + String[] snippets = highlighter.highlight("content", Indexed.Analyzer.DEFAULT, content); + + assertThat(snippets).hasSize(1).allSatisfy( + snippet -> assertThat(snippet).contains("<>Golgafrinchan") + ); + } + + @Test + void shouldHighlightCodeAndKeepLines() throws IOException, InvalidTokenOffsetsException { + String[] snippets = highlightCode("GameOfLife.java", "die"); + + assertThat(snippets).hasSize(1).allSatisfy( + snippet -> assertThat(snippet.split("\n")).contains( + "\t\t\t\tint neighbors= getNeighbors(above, same, below);", + "\t\t\t\tif(neighbors < 2 || neighbors > 3){", + "\t\t\t\t\tnewGen[row]+= \"_\";//<2 or >3 neighbors -> <>die", + "\t\t\t\t}else if(neighbors == 3){", + "\t\t\t\t\tnewGen[row]+= \"#\";//3 neighbors -> spawn/live" + ) + ); + } + + @Test + void shouldHighlightCodeInTsx() throws IOException, InvalidTokenOffsetsException { + String[] snippets = highlightCode("Button.tsx", "inherit"); + + assertThat(snippets).hasSize(1).allSatisfy( + snippet -> assertThat(snippet.split("\n")).contains( + "}) => {", + " const renderIcon = () => {", + " return <>{icon ? 
inherit\" className=\"is-medium pr-1\" /> : null};", + " };" + ) + ); + } + + @Test + void shouldHighlightFirstCodeLine() throws InvalidTokenOffsetsException, IOException { + String[] snippets = highlightCode("GameOfLife.java", "gameoflife"); + + assertThat(snippets).hasSize(1); + } + + @Test + void shouldHighlightLastCodeLine() throws InvalidTokenOffsetsException, IOException { + String[] snippets = highlightCode("Button.tsx", "default"); + + assertThat(snippets).hasSize(1); + } + + private String[] highlightCode(String resource, String search) throws IOException, InvalidTokenOffsetsException { + NonNaturalLanguageAnalyzer analyzer = new NonNaturalLanguageAnalyzer(); + Query query = new TermQuery(new Term("content", search)); + + String content = content(resource); + + LuceneHighlighter highlighter = new LuceneHighlighter(analyzer, query); + return highlighter.highlight("content", Indexed.Analyzer.CODE, content); + } + + @SuppressWarnings("UnstableApiUsage") + private String content(String resource) throws IOException { + URL url = Resources.getResource("sonia/scm/search/" + resource + ".txt"); + return Resources.toString(url, StandardCharsets.UTF_8); + } + +} diff --git a/scm-webapp/src/test/resources/sonia/scm/search/Button.tsx.txt b/scm-webapp/src/test/resources/sonia/scm/search/Button.tsx.txt new file mode 100644 index 0000000000..08576a3eee --- /dev/null +++ b/scm-webapp/src/test/resources/sonia/scm/search/Button.tsx.txt @@ -0,0 +1,120 @@ +/* + * MIT License + * + * Copyright (c) 2020-present Cloudogu GmbH and Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +import React, { FC, MouseEvent, ReactNode, KeyboardEvent } from "react"; +import classNames from "classnames"; +import { Link } from "react-router-dom"; +import Icon from "../Icon"; +import { createAttributesForTesting } from "../devBuild"; + +export type ButtonProps = { + label?: string; + title?: string; + loading?: boolean; + disabled?: boolean; + action?: (event: MouseEvent | KeyboardEvent) => void; + link?: string; + className?: string; + icon?: string; + fullWidth?: boolean; + reducedMobile?: boolean; + children?: ReactNode; + testId?: string; +}; + +type Props = ButtonProps & { + type?: "button" | "submit" | "reset"; + color?: string; +}; + +const Button: FC = ({ + link, + className, + icon, + fullWidth, + reducedMobile, + testId, + children, + label, + type = "button", + title, + loading, + disabled, + action, + color = "primary", +}) => { + const renderIcon = () => { + return <>{icon ? 
: null}; + }; + + if (link && !disabled) { + return ( + + {renderIcon()}{" "} + {(label || children) && ( + <> + {label} {children} + + )} + + ); + } + + return ( + + ); +}; + +export default Button; diff --git a/scm-webapp/src/test/resources/sonia/scm/search/GameOfLife.java.txt b/scm-webapp/src/test/resources/sonia/scm/search/GameOfLife.java.txt new file mode 100644 index 0000000000..e5c820c75f --- /dev/null +++ b/scm-webapp/src/test/resources/sonia/scm/search/GameOfLife.java.txt @@ -0,0 +1,91 @@ +public class GameOfLife{ + public static void main(String[] args){ + String[] dish= { + "_#_", + "_#_", + "_#_",}; + int gens= 3; + for(int i= 0;i < gens;i++){ + System.out.println("Generation " + i + ":"); + print(dish); + dish= life(dish); + } + } + + public static String[] life(String[] dish){ + String[] newGen= new String[dish.length]; + for(int row= 0;row < dish.length;row++){//each row + newGen[row]= ""; + for(int i= 0;i < dish[row].length();i++){//each char in the row + String above= "";//neighbors above + String same= "";//neighbors in the same row + String below= "";//neighbors below + if(i == 0){//all the way on the left + //no one above if on the top row + //otherwise grab the neighbors from above + above= (row == 0) ? null : dish[row - 1].substring(i, + i + 2); + same= dish[row].substring(i + 1, i + 2); + //no one below if on the bottom row + //otherwise grab the neighbors from below + below= (row == dish.length - 1) ? null : dish[row + 1] + .substring(i, i + 2); + }else if(i == dish[row].length() - 1){//right + //no one above if on the top row + //otherwise grab the neighbors from above + above= (row == 0) ? null : dish[row - 1].substring(i - 1, + i + 1); + same= dish[row].substring(i - 1, i); + //no one below if on the bottom row + //otherwise grab the neighbors from below + below= (row == dish.length - 1) ? 
null : dish[row + 1] + .substring(i - 1, i + 1); + }else{//anywhere else + //no one above if on the top row + //otherwise grab the neighbors from above + above= (row == 0) ? null : dish[row - 1].substring(i - 1, + i + 2); + same= dish[row].substring(i - 1, i) + + dish[row].substring(i + 1, i + 2); + //no one below if on the bottom row + //otherwise grab the neighbors from below + below= (row == dish.length - 1) ? null : dish[row + 1] + .substring(i - 1, i + 2); + } + int neighbors= getNeighbors(above, same, below); + if(neighbors < 2 || neighbors > 3){ + newGen[row]+= "_";//<2 or >3 neighbors -> die + }else if(neighbors == 3){ + newGen[row]+= "#";//3 neighbors -> spawn/live + }else{ + newGen[row]+= dish[row].charAt(i);//2 neighbors -> stay + } + } + } + return newGen; + } + + public static int getNeighbors(String above, String same, String below){ + int ans= 0; + if(above != null){//no one above + for(char x: above.toCharArray()){//each neighbor from above + if(x == '#') ans++;//count it if someone is here + } + } + for(char x: same.toCharArray()){//two on either side + if(x == '#') ans++;//count it if someone is here + } + if(below != null){//no one below + for(char x: below.toCharArray()){//each neighbor below + if(x == '#') ans++;//count it if someone is here + } + } + return ans; + } + + public static void print(String[] dish){ + for(String s: dish){ + System.out.println(s); + } + } +} diff --git a/scm-webapp/src/test/resources/sonia/scm/search/content.txt b/scm-webapp/src/test/resources/sonia/scm/search/content.txt new file mode 100644 index 0000000000..49720d9995 --- /dev/null +++ b/scm-webapp/src/test/resources/sonia/scm/search/content.txt @@ -0,0 +1,6 @@ +In The Restaurant at the End of the Universe (published in 1980), Zaphod is separated from the others and finds he is part of a conspiracy to uncover who really runs the Universe. Zaphod meets Zarniwoop, a conspirator and editor for The Guide, who knows where to find the secret ruler. 
Zaphod becomes briefly reunited with the others for a trip to Milliways, the restaurant of the title. Zaphod and Ford decide to steal a ship from there, which turns out to be a stunt ship pre-programmed to plunge into a star as a special effect in a stage show. Unable to change course, the main characters get Marvin to run the teleporter they find in the ship, which is working other than having no automatic control (someone must remain behind to operate it), and Marvin seemingly sacrifices himself. Zaphod and Trillian discover that the Universe is in the safe hands of a simple man living on a remote planet in a wooden shack with his cat. + +Ford and Arthur, meanwhile, end up on a spacecraft full of the outcasts of the Golgafrinchan civilisation. The ship crashes on prehistoric Earth; Ford and Arthur are stranded, and it becomes clear that the inept Golgafrinchans are the ancestors of modern humans, having displaced the Earth's indigenous hominids. This has disrupted the Earth's programming so that when Ford and Arthur manage to extract the final readout from Arthur's subconscious mind by pulling lettered tiles from a Scrabble set, it is "What do you get if you multiply six by nine?" Arthur then comments, "I've always said there was something fundamentally wrong with the universe." + +The book was adapted from the remaining material in the radio series—covering from the fifth episode to the twelfth episode, although the ordering was greatly changed (in particular, the events of Fit the Sixth, with Ford and Arthur being stranded on pre-historic Earth, end the book, and their rescue in Fit the Seventh is deleted), and most of the Brontitall incident was omitted, instead of the Haggunenon sequence, co-written by John Loyd, the Disaster Area stunt ship was substituted—this having first been introduced in the LP version. Adams himself considered Restaurant to be his best novel of the five. +