From 1c1730facd416c3f8f1d6345962faff483279266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 24 Mar 2020 17:22:29 +0100 Subject: [PATCH] Mask wildcard query special characters on keyword queries (#53127) (#53512) Wildcard queries on keyword fields get normalized; however, this normalization step should exclude the two wildcard special characters * and ? in order to keep the wildcard query itself intact. Closes #46300 --- .../index/mapper/KeywordFieldMapper.java | 1 + .../index/mapper/StringFieldType.java | 44 +++++-- .../index/mapper/TypeFieldMapper.java | 25 ++++ .../index/query/RangeQueryBuilderTests.java | 7 -- .../query/WildcardQueryBuilderTests.java | 11 +- .../search/query/SearchQueryIT.java | 118 +++++++++++++++++- 6 files changed, 185 insertions(+), 21 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 6b6c4a53a5f..778fa3450a5 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -202,6 +202,7 @@ public final class KeywordFieldMapper extends FieldMapper { this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace; } + @Override public KeywordFieldType clone() { return new KeywordFieldType(this); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 4ddda3df0af..05bf6b61d1d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -21,8 +21,6 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyQuery; -import org.apache.lucene.search.MatchAllDocsQuery; -import 
org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; @@ -31,6 +29,7 @@ import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.unit.Fuzziness; @@ -38,6 +37,8 @@ import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.support.QueryParsers; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; @@ -47,6 +48,8 @@ import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; * can be implemented. */ public abstract class StringFieldType extends TermBasedFieldType { + private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); + public StringFieldType() {} protected StringFieldType(MappedFieldType ref) { @@ -92,16 +95,41 @@ public abstract class StringFieldType extends TermBasedFieldType { @Override public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { - Query termQuery = termQuery(value, context); - if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) { - return termQuery; - } - + failIfNotIndexed(); if (context.allowExpensiveQueries() == false) { throw new ElasticsearchException("[wildcard] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."); } - Term term = MappedFieldType.extractTerm(termQuery); + + Term term; + if (searchAnalyzer() != null) { + // we want to normalize everything except wildcard characters, e.g. 
F?o Ba* to f?o ba*, even if e.g there + // is a char_filter that would otherwise remove them + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); + BytesRefBuilder sb = new BytesRefBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + if (wildcardMatcher.start() > 0) { + String chunk = value.substring(last, wildcardMatcher.start()); + + BytesRef normalized = searchAnalyzer().normalize(name(), chunk); + sb.append(normalized); + } + // append the matched group - without normalizing + sb.append(new BytesRef(wildcardMatcher.group())); + + last = wildcardMatcher.end(); + } + if (last < value.length()) { + String chunk = value.substring(last); + BytesRef normalized = searchAnalyzer().normalize(name(), chunk); + sb.append(normalized); + } + term = new Term(name(), sb.toBytesRef()); + } else { + term = new Term(name(), indexedValueForSearch(value)); + } WildcardQuery query = new WildcardQuery(term); QueryParsers.setRewriteMethod(query, method); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java index c4d9ef966ca..1795e4a629b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TypeFieldMapper.java @@ -31,10 +31,13 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.search.Queries; import 
org.elasticsearch.common.xcontent.XContentBuilder; @@ -42,6 +45,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.plain.ConstantIndexFieldData; import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.query.support.QueryParsers; import java.io.IOException; import java.util.Arrays; @@ -51,6 +55,8 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; +import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; + public class TypeFieldMapper extends MetadataFieldMapper { public static final String NAME = "_type"; @@ -170,6 +176,25 @@ public class TypeFieldMapper extends MetadataFieldMapper { } return result; } + + @Override + public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { + Query termQuery = termQuery(value, context); + if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) { + return termQuery; + } + + if (context.allowExpensiveQueries() == false) { + throw new ElasticsearchException("[wildcard] queries cannot be executed when '" + + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."); + } + Term term = MappedFieldType.extractTerm(termQuery); + + WildcardQuery query = new WildcardQuery(term); + QueryParsers.setRewriteMethod(query, method); + return query; + } + } /** diff --git a/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java index a282bbe987d..6c36e35b055 100644 --- a/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/RangeQueryBuilderTests.java @@ -560,13 +560,6 @@ public class RangeQueryBuilderTests extends AbstractQueryTestCase> nodePlugins() { - return 
Collections.singleton(InternalSettingsPlugin.class); + return Arrays.asList(InternalSettingsPlugin.class, MockAnalysisPlugin.class); } @Override @@ -1897,6 +1910,107 @@ public class SearchQueryIT extends ESIntegTestCase { } + /** + * Test that wildcard queries on keyword fields get normalized + */ + public void testWildcardQueryNormalizationOnKeywordField() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder() + .put("index.analysis.normalizer.lowercase_normalizer.type", "custom") + .putList("index.analysis.normalizer.lowercase_normalizer.filter", "lowercase") + .build()) + .addMapping("_doc", "field1", "type=keyword,normalizer=lowercase_normalizer")); + client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get(); + refresh(); + + { + WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = wildcardQuery("field1", "bb*"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + } + + /** + * Test that wildcard queries on text fields get normalized + */ + public void testWildcardQueryNormalizationOnTextField() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder() + .put("index.analysis.analyzer.lowercase_analyzer.type", "custom") + .put("index.analysis.analyzer.lowercase_analyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.lowercase_analyzer.filter", "lowercase") + .build()) + .addMapping("_doc", "field1", "type=text,analyzer=lowercase_analyzer")); + client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get(); + refresh(); + + { + WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = 
wildcardQuery("field1", "bb*"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + } + + /** + * Reserved characters should be excluded when the normalization is applied for keyword fields. + * See https://github.com/elastic/elasticsearch/issues/46300 for details. + */ + public void testWildcardQueryNormalizationKeywordSpecialCharacters() { + assertAcked(prepareCreate("test") + .setSettings(Settings.builder().put("index.analysis.char_filter.no_wildcard.type", "mock_pattern_replace") + .put("index.analysis.normalizer.no_wildcard.type", "custom") + .put("index.analysis.normalizer.no_wildcard.char_filter", "no_wildcard").build()) + .addMapping("_doc", "field", "type=keyword,normalizer=no_wildcard")); + client().prepareIndex("test", "_doc", "1").setSource("field", "label-1").get(); + refresh(); + + WildcardQueryBuilder wildCardQuery = wildcardQuery("field", "la*"); + SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + + wildCardQuery = wildcardQuery("field", "la*el-?"); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 1L); + } + + public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { + + @Override + public Map> getCharFilters() { + return singletonMap("mock_pattern_replace", (indexSettings, env, name, settings) -> { + class Factory implements NormalizingCharFilterFactory { + + private final Pattern pattern = Regex.compile("[\\*\\?]", null); + + @Override + public String name() { + return name; + } + + @Override + public Reader create(Reader reader) { + return new PatternReplaceCharFilter(pattern, "", reader); + } + } + return new Factory(); + }); + } + + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name, + () -> new 
MockTokenizer(MockTokenizer.KEYWORD, false))); + } + } + /** * Test correct handling {@link SpanBooleanQueryRewriteWithMaxClause#rewrite(IndexReader, MultiTermQuery)}. That rewrite method is e.g. * set for fuzzy queries with "constant_score" rewrite nested inside a `span_multi` query and would cause NPEs due to an unset