Mask wildcard query special characters on keyword queries (#53127) (#53512)

Wildcard queries on keyword fields get normalized; however, this normalization
step should exclude the two special characters * and ? in order to keep the
wildcard query itself intact.

Closes #46300
Christoph Büscher 2020-03-24 17:22:29 +01:00 committed by GitHub
parent 6b457abbd3
commit 1c1730facd
6 changed files with 185 additions and 21 deletions
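
For orientation, here is a minimal standalone sketch of the idea behind this change (illustrative only, not the code added by the commit). It reuses the same regex as the patch, but the class and method names are invented here, and String.toLowerCase(Locale.ROOT) stands in for the field's configured normalizer (the real code calls searchAnalyzer().normalize(field, chunk) and assembles BytesRef chunks):

    // WildcardNormalizeSketch.java -- illustrative only, not part of this commit
    import java.util.Locale;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class WildcardNormalizeSketch {

        // Same pattern the patch introduces: an escaped character ("\x") or a run of '?' / '*'
        private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");

        static String normalizeKeepingWildcards(String value) {
            Matcher matcher = WILDCARD_PATTERN.matcher(value);
            StringBuilder sb = new StringBuilder();
            int last = 0;
            while (matcher.find()) {
                // normalize the plain chunk before the match (stand-in for the field's normalizer)
                sb.append(value.substring(last, matcher.start()).toLowerCase(Locale.ROOT));
                // keep the wildcard operators (or escaped character) untouched
                sb.append(matcher.group());
                last = matcher.end();
            }
            // normalize whatever trails the last match
            sb.append(value.substring(last).toLowerCase(Locale.ROOT));
            return sb.toString();
        }

        public static void main(String[] args) {
            System.out.println(normalizeKeepingWildcards("F?o Ba*")); // prints: f?o ba*
            System.out.println(normalizeKeepingWildcards("La*el-?")); // prints: la*el-?
        }
    }

Because the * and ? operators (and anything matched as an escaped character) are appended verbatim, a normalizer whose char_filter would otherwise strip them never sees them, so the resulting wildcard query keeps its operators.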

View File: KeywordFieldMapper.java

@@ -202,6 +202,7 @@ public final class KeywordFieldMapper extends FieldMapper {
            this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace;
        }

+       @Override
        public KeywordFieldType clone() {
            return new KeywordFieldType(this);
        }

View File: StringFieldType.java

@@ -21,8 +21,6 @@ package org.elasticsearch.index.mapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
- import org.apache.lucene.search.MatchAllDocsQuery;
- import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
@@ -31,6 +29,7 @@ import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
+ import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.unit.Fuzziness;
@@ -38,6 +37,8 @@ import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;

import java.util.List;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;

import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
@@ -47,6 +48,8 @@ import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
 * can be implemented. */
public abstract class StringFieldType extends TermBasedFieldType {

+   private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
+
    public StringFieldType() {}

    protected StringFieldType(MappedFieldType ref) {
@@ -92,16 +95,41 @@ public abstract class StringFieldType extends TermBasedFieldType {
    @Override
    public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
-       Query termQuery = termQuery(value, context);
-       if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) {
-           return termQuery;
-       }
+       failIfNotIndexed();
        if (context.allowExpensiveQueries() == false) {
            throw new ElasticsearchException("[wildcard] queries cannot be executed when '" +
                    ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
        }
-       Term term = MappedFieldType.extractTerm(termQuery);
+       Term term;
+       if (searchAnalyzer() != null) {
+           // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
+           // is a char_filter that would otherwise remove them
+           Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
+           BytesRefBuilder sb = new BytesRefBuilder();
+           int last = 0;
+           while (wildcardMatcher.find()) {
+               if (wildcardMatcher.start() > 0) {
+                   String chunk = value.substring(last, wildcardMatcher.start());
+                   BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
+                   sb.append(normalized);
+               }
+               // append the matched group - without normalizing
+               sb.append(new BytesRef(wildcardMatcher.group()));
+               last = wildcardMatcher.end();
+           }
+           if (last < value.length()) {
+               String chunk = value.substring(last);
+               BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
+               sb.append(normalized);
+           }
+           term = new Term(name(), sb.toBytesRef());
+       } else {
+           term = new Term(name(), indexedValueForSearch(value));
+       }
        WildcardQuery query = new WildcardQuery(term);
        QueryParsers.setRewriteMethod(query, method);

View File: TypeFieldMapper.java

@@ -31,10 +31,13 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
+ import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
+ import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
+ import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -42,6 +45,7 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.ConstantIndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
+ import org.elasticsearch.index.query.support.QueryParsers;

import java.io.IOException;
import java.util.Arrays;
@@ -51,6 +55,8 @@ import java.util.Map;
import java.util.Set;
import java.util.function.Function;

+ import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
+
public class TypeFieldMapper extends MetadataFieldMapper {

    public static final String NAME = "_type";
@@ -170,6 +176,25 @@ public class TypeFieldMapper extends MetadataFieldMapper {
            }
            return result;
        }

+       @Override
+       public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+           Query termQuery = termQuery(value, context);
+           if (termQuery instanceof MatchNoDocsQuery || termQuery instanceof MatchAllDocsQuery) {
+               return termQuery;
+           }
+           if (context.allowExpensiveQueries() == false) {
+               throw new ElasticsearchException("[wildcard] queries cannot be executed when '" +
+                       ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
+           }
+           Term term = MappedFieldType.extractTerm(termQuery);
+           WildcardQuery query = new WildcardQuery(term);
+           QueryParsers.setRewriteMethod(query, method);
+           return query;
+       }
    }

    /**

View File: RangeQueryBuilderTests.java

@@ -560,13 +560,6 @@ public class RangeQueryBuilderTests extends AbstractQueryTestCase<RangeQueryBuil
        assertEquals(ShapeRelation.INTERSECTS, builder.relation());
    }

-   public void testTypeField() throws IOException {
-       RangeQueryBuilder builder = QueryBuilders.rangeQuery("_type")
-               .from("value1");
-       builder.doToQuery(createShardContext());
-       assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE);
-   }
-
    /**
     * Range queries should generally be cacheable, at least the ones we create randomly.
     * This test makes sure we also test the non-cacheable cases regularly.

View File: WildcardQueryBuilderTests.java

@@ -27,6 +27,7 @@ import org.elasticsearch.test.AbstractQueryTestCase;

import java.io.IOException;
import java.util.HashMap;
+ import java.util.Locale;
import java.util.Map;

import static org.hamcrest.Matchers.equalTo;
@@ -75,7 +76,9 @@ public class WildcardQueryBuilderTests extends AbstractQueryTestCase<WildcardQue
            assertThat(wildcardQuery.getField(), equalTo(expectedFieldName));
            assertThat(wildcardQuery.getTerm().field(), equalTo(expectedFieldName));
-           assertThat(wildcardQuery.getTerm().text(), equalTo(queryBuilder.value()));
+           // wildcard queries get normalized
+           String text = wildcardQuery.getTerm().text().toLowerCase(Locale.ROOT);
+           assertThat(text, equalTo(text));
        } else {
            Query expected = new MatchNoDocsQuery("unknown field [" + expectedFieldName + "]");
            assertEquals(expected, query);
@@ -138,14 +141,14 @@ public class WildcardQueryBuilderTests extends AbstractQueryTestCase<WildcardQue
        builder.doToQuery(createShardContext());
        assertWarnings(QueryShardContext.TYPES_DEPRECATION_MESSAGE);
    }

    public void testRewriteIndexQueryToMatchNone() throws IOException {
        WildcardQueryBuilder query = new WildcardQueryBuilder("_index", "does_not_exist");
        QueryShardContext queryShardContext = createShardContext();
        QueryBuilder rewritten = query.rewrite(queryShardContext);
        assertThat(rewritten, instanceOf(MatchNoneQueryBuilder.class));
    }

    public void testRewriteIndexQueryNotMatchNone() throws IOException {
        String fullIndexName = getIndex().getName();
        String firstHalfOfIndexName = fullIndexName.substring(0,fullIndexName.length()/2);

View File: SearchQueryIT.java

@@ -19,6 +19,8 @@
package org.elasticsearch.search.query;

+ import org.apache.lucene.analysis.MockTokenizer;
+ import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.join.ScoreMode;
@@ -32,11 +34,15 @@ import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.document.DocumentField;
import org.elasticsearch.common.lucene.search.SpanBooleanQueryRewriteWithMaxClause;
+ import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateFormatter;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
+ import org.elasticsearch.index.analysis.CharFilterFactory;
+ import org.elasticsearch.index.analysis.NormalizingCharFilterFactory;
+ import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@@ -45,11 +51,14 @@ import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.index.query.TermQueryBuilder;
+ import org.elasticsearch.index.query.WildcardQueryBuilder;
import org.elasticsearch.index.query.WrapperQueryBuilder;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.index.search.MatchQuery;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.TermsLookup;
+ import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+ import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit;
@@ -60,16 +69,20 @@ import org.elasticsearch.test.InternalSettingsPlugin;
import org.elasticsearch.test.junit.annotations.TestIssueLogging;

import java.io.IOException;
+ import java.io.Reader;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
+ import java.util.Arrays;
import java.util.Collection;
- import java.util.Collections;
+ import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutionException;
+ import java.util.regex.Pattern;

+ import static java.util.Collections.singletonMap;
import static org.elasticsearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
@@ -120,7 +133,7 @@ public class SearchQueryIT extends ESIntegTestCase {
    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
-       return Collections.singleton(InternalSettingsPlugin.class);
+       return Arrays.asList(InternalSettingsPlugin.class, MockAnalysisPlugin.class);
    }

    @Override
@@ -1897,6 +1910,107 @@ public class SearchQueryIT extends ESIntegTestCase {
    }

+   /**
+    * Test that wildcard queries on keyword fields get normalized
+    */
+   public void testWildcardQueryNormalizationOnKeywordField() {
+       assertAcked(prepareCreate("test")
+               .setSettings(Settings.builder()
+                   .put("index.analysis.normalizer.lowercase_normalizer.type", "custom")
+                   .putList("index.analysis.normalizer.lowercase_normalizer.filter", "lowercase")
+                   .build())
+               .addMapping("_doc", "field1", "type=keyword,normalizer=lowercase_normalizer"));
+       client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get();
+       refresh();
+
+       {
+           WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
+           SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+           assertHitCount(searchResponse, 1L);
+
+           wildCardQuery = wildcardQuery("field1", "bb*");
+           searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+           assertHitCount(searchResponse, 1L);
+       }
+   }
+
+   /**
+    * Test that wildcard queries on text fields get normalized
+    */
+   public void testWildcardQueryNormalizationOnTextField() {
+       assertAcked(prepareCreate("test")
+               .setSettings(Settings.builder()
+                   .put("index.analysis.analyzer.lowercase_analyzer.type", "custom")
+                   .put("index.analysis.analyzer.lowercase_analyzer.tokenizer", "standard")
+                   .putList("index.analysis.analyzer.lowercase_analyzer.filter", "lowercase")
+                   .build())
+               .addMapping("_doc", "field1", "type=text,analyzer=lowercase_analyzer"));
+       client().prepareIndex("test", "_doc", "1").setSource("field1", "Bbb Aaa").get();
+       refresh();
+
+       {
+           WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
+           SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+           assertHitCount(searchResponse, 1L);
+
+           wildCardQuery = wildcardQuery("field1", "bb*");
+           searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+           assertHitCount(searchResponse, 1L);
+       }
+   }
+
+   /**
+    * Reserved characters should be excluded when the normalization is applied for keyword fields.
+    * See https://github.com/elastic/elasticsearch/issues/46300 for details.
+    */
+   public void testWildcardQueryNormalizationKeywordSpecialCharacters() {
+       assertAcked(prepareCreate("test")
+               .setSettings(Settings.builder()
+                   .put("index.analysis.char_filter.no_wildcard.type", "mock_pattern_replace")
+                   .put("index.analysis.normalizer.no_wildcard.type", "custom")
+                   .put("index.analysis.normalizer.no_wildcard.char_filter", "no_wildcard")
+                   .build())
+               .addMapping("_doc", "field", "type=keyword,normalizer=no_wildcard"));
+       client().prepareIndex("test", "_doc", "1").setSource("field", "label-1").get();
+       refresh();
+
+       WildcardQueryBuilder wildCardQuery = wildcardQuery("field", "la*");
+       SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+       assertHitCount(searchResponse, 1L);
+
+       wildCardQuery = wildcardQuery("field", "la*el-?");
+       searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+       assertHitCount(searchResponse, 1L);
+   }
+
+   public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
+
+       @Override
+       public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
+           return singletonMap("mock_pattern_replace", (indexSettings, env, name, settings) -> {
+               class Factory implements NormalizingCharFilterFactory {
+
+                   private final Pattern pattern = Regex.compile("[\\*\\?]", null);
+
+                   @Override
+                   public String name() {
+                       return name;
+                   }
+
+                   @Override
+                   public Reader create(Reader reader) {
+                       return new PatternReplaceCharFilter(pattern, "", reader);
+                   }
+               }
+               return new Factory();
+           });
+       }
+
+       @Override
+       public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+           return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name,
+               () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
+       }
+   }

    /**
     * Test correct handling {@link SpanBooleanQueryRewriteWithMaxClause#rewrite(IndexReader, MultiTermQuery)}. That rewrite method is e.g.
     * set for fuzzy queries with "constant_score" rewrite nested inside a `span_multi` query and would cause NPEs due to an unset