From 5b4d032611a9c03bdaef9af02b4779102fc325f0 Mon Sep 17 00:00:00 2001 From: Sebastian Sdorra Date: Wed, 8 Sep 2021 10:56:57 +0200 Subject: [PATCH] Fix query for enum fields (#1800) The enum fields were not searchable, because they were stored without analysation or transformation, but if an enum field was searched for within a query, the StandardAnalyzer was used. This means that the enum was stored in the index as an uppercase string, but the query searches for lowercase (the StandardAnalyzer uses a lowercase filter). To fix this problem we are now using the KeywordAnalyzer for every non tokenized field. The StandardAnalyzer is only used for tokenized fields, which does not specify an other analyzer such code, path or id. For enum fields we have introduced a new analyzer which uses an uppercase filter by default, this makes it possible to ignore case during search for enum fields. --- .../sonia/scm/search/AnalyzerFactory.java | 44 ++++++++++++++++-- .../sonia/scm/search/LuceneQueryBuilder.java | 3 +- .../scm/search/LuceneSearchableField.java | 2 + .../sonia/scm/search/ValueExtractors.java | 9 +++- .../scm/search/LuceneQueryBuilderTest.java | 46 ++++++++++++++++++- .../sonia/scm/search/ValueExtractorsTest.java | 11 +++++ 6 files changed, 105 insertions(+), 10 deletions(-) diff --git a/scm-webapp/src/main/java/sonia/scm/search/AnalyzerFactory.java b/scm-webapp/src/main/java/sonia/scm/search/AnalyzerFactory.java index 7a2767d21c..7cbff86f4d 100644 --- a/scm-webapp/src/main/java/sonia/scm/search/AnalyzerFactory.java +++ b/scm-webapp/src/main/java/sonia/scm/search/AnalyzerFactory.java @@ -25,16 +25,21 @@ package sonia.scm.search; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.core.UpperCaseFilterFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import java.io.IOException; import java.util.HashMap; import java.util.Map; public class AnalyzerFactory { public Analyzer create(LuceneSearchableType type) { - Analyzer defaultAnalyzer = createDefaultAnalyzer(); + Analyzer defaultAnalyzer = createNonTokenizedAnalyzer(); Map analyzerMap = new HashMap<>(); for (LuceneSearchableField field : type.getAllFields()) { @@ -44,13 +49,42 @@ public class AnalyzerFactory { return new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerMap); } - private Analyzer createDefaultAnalyzer() { - return new StandardAnalyzer(); + private Analyzer createNonTokenizedAnalyzer() { + return new KeywordAnalyzer(); } private void addFieldAnalyzer(Map analyzerMap, LuceneSearchableField field) { - if (field.getAnalyzer() != Indexed.Analyzer.DEFAULT) { - analyzerMap.put(field.getName(), new NonNaturalLanguageAnalyzer()); + Analyzer analyzer = createAnalyzer(field); + if (analyzer != null) { + analyzerMap.put(field.getName(), analyzer); + } + } + + private Analyzer createAnalyzer(LuceneSearchableField field) { + if (field.isTokenized()) { + return createTokenizedAnalyzer(field.getAnalyzer()); + } else if (field.getType().isEnum()) { + return createEnumAnalyzer(); + } else { + return null; + } + } + + private Analyzer createTokenizedAnalyzer(Indexed.Analyzer analyzer) { + if (analyzer == Indexed.Analyzer.DEFAULT) { + return new StandardAnalyzer(); + } + return new NonNaturalLanguageAnalyzer(); + } + + private Analyzer createEnumAnalyzer() { + try { + return CustomAnalyzer.builder() + .withTokenizer(KeywordTokenizerFactory.class) + .addTokenFilter(UpperCaseFilterFactory.class) + .build(); + } catch (IOException ex) { + throw new IllegalStateException("failed to create enum analyzer", ex); } } diff --git a/scm-webapp/src/main/java/sonia/scm/search/LuceneQueryBuilder.java b/scm-webapp/src/main/java/sonia/scm/search/LuceneQueryBuilder.java index 7ac161bc46..dc7963a2e7 100644 --- a/scm-webapp/src/main/java/sonia/scm/search/LuceneQueryBuilder.java +++ b/scm-webapp/src/main/java/sonia/scm/search/LuceneQueryBuilder.java @@ -48,7 +48,6 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; import java.io.IOException; -import java.util.Locale; public class LuceneQueryBuilder extends QueryBuilder { @@ -138,7 +137,7 @@ public class LuceneQueryBuilder extends QueryBuilder { throw new NoDefaultQueryFieldsFoundException(searchableType.getType()); } - String queryString = queryParams.getQueryString().toLowerCase(Locale.ENGLISH); + String queryString = queryParams.getQueryString(); boolean hasWildcard = containsWildcard(queryString); BooleanQuery.Builder builder = new BooleanQuery.Builder(); diff --git a/scm-webapp/src/main/java/sonia/scm/search/LuceneSearchableField.java b/scm-webapp/src/main/java/sonia/scm/search/LuceneSearchableField.java index 25c86ce151..6b3e592bea 100644 --- a/scm-webapp/src/main/java/sonia/scm/search/LuceneSearchableField.java +++ b/scm-webapp/src/main/java/sonia/scm/search/LuceneSearchableField.java @@ -43,6 +43,7 @@ class LuceneSearchableField implements SearchableField { private final PointsConfig pointsConfig; private final Indexed.Analyzer analyzer; private final boolean searchable; + private final boolean tokenized; LuceneSearchableField(Field field, Indexed indexed) { this.name = name(field, indexed); @@ -54,6 +55,7 @@ class LuceneSearchableField implements SearchableField { this.pointsConfig = IndexableFields.pointConfig(field); this.analyzer = indexed.analyzer(); this.searchable = indexed.type().isSearchable(); + this.tokenized = indexed.type().isTokenized() && String.class.isAssignableFrom(type); } Object value(Document document) { diff --git a/scm-webapp/src/main/java/sonia/scm/search/ValueExtractors.java b/scm-webapp/src/main/java/sonia/scm/search/ValueExtractors.java index 82c8538061..0476d3ef4f 100644 --- a/scm-webapp/src/main/java/sonia/scm/search/ValueExtractors.java +++ b/scm-webapp/src/main/java/sonia/scm/search/ValueExtractors.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.IndexableField; import javax.annotation.Nonnull; import java.time.Instant; +import java.util.Locale; final class ValueExtractors { @@ -54,7 +55,13 @@ final class ValueExtractors { @SuppressWarnings({"unchecked", "rawtypes"}) private static ValueExtractor enumExtractor(String name, Class type) { - return doc -> Enum.valueOf(type, doc.get(name)); + return doc -> { + String value = doc.get(name); + if (value != null) { + return Enum.valueOf(type, value.toUpperCase(Locale.ENGLISH)); + } + return null; + }; } @Nonnull diff --git a/scm-webapp/src/test/java/sonia/scm/search/LuceneQueryBuilderTest.java b/scm-webapp/src/test/java/sonia/scm/search/LuceneQueryBuilderTest.java index 0ae549e498..8945eca061 100644 --- a/scm-webapp/src/test/java/sonia/scm/search/LuceneQueryBuilderTest.java +++ b/scm-webapp/src/test/java/sonia/scm/search/LuceneQueryBuilderTest.java @@ -55,6 +55,7 @@ import java.io.IOException; import java.time.Instant; import java.time.temporal.ChronoUnit; import java.util.List; +import java.util.Locale; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -545,6 +546,26 @@ class LuceneQueryBuilderTest { assertThat(hit.getRepositoryId()).contains("4211"); } + @Test + void shouldQueryByEnumField() throws IOException { + try (IndexWriter writer = writer()) { + writer.addDocument(animalPerson("Trillian", Animal.PENGUIN)); + } + + QueryResult result = query(PersonWithAnimal.class, "animal:penguin"); + assertThat(result.getTotalHits()).isOne(); + } + + @Test + void shouldQueryByEnumFieldAndIgnoreCase() throws IOException { + try (IndexWriter writer = writer()) { + writer.addDocument(animalPerson("Arthur", Animal.ALPACA)); + } + + QueryResult result = query(PersonWithAnimal.class, "animal:AlPaCa"); + assertThat(result.getTotalHits()).isOne(); + } + private QueryResult query(Class type, String queryString) throws IOException { return query(type, queryString, null, null); } @@ -555,7 +576,7 @@ class LuceneQueryBuilderTest { LuceneSearchableType searchableType = resolver.resolve(type); lenient().when(opener.openForRead(searchableType, "default")).thenReturn(reader); LuceneQueryBuilder builder = new LuceneQueryBuilder( - opener, "default", searchableType, new StandardAnalyzer() + opener, "default", searchableType, new AnalyzerFactory().create(searchableType) ); return builder.count(queryString).getTotalHits(); } @@ -579,7 +600,7 @@ class LuceneQueryBuilderTest { lenient().when(opener.openForRead(searchableType, "default")).thenReturn(reader); LuceneQueryBuilder builder = new LuceneQueryBuilder<>( - opener, "default", searchableType, new StandardAnalyzer() + opener, "default", searchableType, new AnalyzerFactory().create(searchableType) ); consumer.accept(builder); return builder.execute(queryString); @@ -648,6 +669,13 @@ class LuceneQueryBuilderTest { return document; } + private Document animalPerson(String name, Animal animal) { + Document document = new Document(); + document.add(new TextField("name", name, Field.Store.YES)); + document.add(new StringField("animal", animal.name(), Field.Store.YES)); + return document; + } + @Getter @IndexedType static class Types { @@ -693,4 +721,18 @@ class LuceneQueryBuilderTest { private String content; } + enum Animal { + PENGUIN, ALPACA + } + + @Getter + @IndexedType + static class PersonWithAnimal { + @Indexed(defaultQuery = true) + private String name; + + @Indexed + private Animal animal; + } + } diff --git a/scm-webapp/src/test/java/sonia/scm/search/ValueExtractorsTest.java b/scm-webapp/src/test/java/sonia/scm/search/ValueExtractorsTest.java index 446148b0c5..4c9a16e611 100644 --- a/scm-webapp/src/test/java/sonia/scm/search/ValueExtractorsTest.java +++ b/scm-webapp/src/test/java/sonia/scm/search/ValueExtractorsTest.java @@ -43,6 +43,17 @@ class ValueExtractorsTest { assertThat(value).isEqualTo(Animal.PENGUIN); } + @Test + void shouldExtractEnumLowerCaseValue() { + Document document = new Document(); + document.add(new StoredField("animal", "alpaca")); + + ValueExtractor extractor = ValueExtractors.create("animal", Animal.class); + Object value = extractor.extract(document); + + assertThat(value).isEqualTo(Animal.ALPACA); + } + enum Animal { PENGUIN, ALPACA }