[7.x Backport] Prevent SigTerms/SigText from running on fields they do not support (#57485)

SigTerms cannot run on fields that are not searchable, and SigText cannot run on fields that do not have analyzers. Both of these situations fail today with an esoteric exception, so this just formalizes the constraint by throwing an IllegalArgumentException up front. In practice, the only affected field seems to be the `binary` field, which is neither searchable or has a default analyzer (e.g. even numeric and keyword fields have a default analyzer despite not being tokenized) Adds supported-type tests, and makes some changes to the test itself to allow testing sigtext (indexing _source). Also a few tweaks to the test to avoid bad randomization (negative numbers, etc).
2020-06-02 16:03:37 -04:00 · 2020-06-02 16:03:37 -04:00 · 79ac69cfa3
parent f360020048
commit 79ac69cfa3
4 changed files with 56 additions and 8 deletions
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java
@ -205,7 +205,12 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
            }
        }

-        if (!config.unmapped()) {
+        if (config.unmapped() == false) {
+            if (config.fieldContext().fieldType().isSearchable() == false) {
+                throw new IllegalArgumentException("SignificantText aggregation requires fields to be searchable, but ["
+                    + config.fieldContext().fieldType().name() + "] is not");
+            }
+
            this.fieldType = config.fieldContext().fieldType();
            this.indexedFieldName = fieldType.name();
        }
@ -248,6 +253,10 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
    }

    private long getBackgroundFrequency(String value) throws IOException {
+        // fieldType can be null if the field is unmapped, but theoretically this method should only be called
+        // when constructing buckets.  Assert to ensure this is the case
+        // TODO this is a bad setup and it should be refactored
+        assert fieldType != null;
        Query query = fieldType.termQuery(value, queryShardContext);
        if (query instanceof TermQuery) {
            // for types that use the inverted index, we prefer using a caching terms
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java
@ -79,6 +79,10 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory
        // Note that if the field is unmapped (its field type is null), we don't fail,
        // and just use the given field name as a placeholder.
        this.fieldType = queryShardContext.fieldMapper(fieldName);
+        if (fieldType != null && fieldType.indexAnalyzer() == null) {
+            throw new IllegalArgumentException("Field [" + fieldType.name() + "] has no analyzer, but SignificantText " +
+                "requires an analyzed field");
+        }
        this.indexedFieldName = fieldType != null ? fieldType.name() : fieldName;
        this.sourceFieldNames = sourceFieldNames == null
            ? new String[] { indexedFieldName }
@ -120,6 +124,10 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory
    }

    private long getBackgroundFrequency(String value) throws IOException {
+        // fieldType can be null if the field is unmapped, but theoretically this method should only be called
+        // when constructing buckets.  Assert to ensure this is the case
+        // TODO this is a bad setup and it should be refactored
+        assert fieldType != null;
        Query query = fieldType.termQuery(value, queryShardContext);
        if (query instanceof TermQuery) {
            // for types that use the inverted index, we prefer using a caching terms
--- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorTests.java
+++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorTests.java
@ -39,6 +39,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.analysis.AnalyzerScope;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.mapper.BinaryFieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NumberFieldMapper;
@ -52,6 +53,8 @@ import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.AggregatorTestCase;
 import org.elasticsearch.search.aggregations.bucket.terms.SignificantTermsAggregatorFactory.ExecutionMode;
+import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 import org.junit.Before;

 import java.io.IOException;
@ -83,16 +86,18 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
        return new SignificantTermsAggregationBuilder("foo").field(fieldName);
    }

-    /* NOTE - commented out instead of deleted, we need to backport https://github.com/elastic/elasticsearch/pull/52851 to support this.
    @Override
    protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
        return Arrays.asList(CoreValuesSourceType.NUMERIC,
-            CoreValuesSourceType.BYTES);
+            CoreValuesSourceType.BYTES,
+            CoreValuesSourceType.BOOLEAN,
+            CoreValuesSourceType.DATE,
+            CoreValuesSourceType.IP);
    }

    @Override
    protected List<String> unsupportedMappedFieldTypes() {
-        return List.of(
+        return Arrays.asList(
            NumberFieldMapper.NumberType.DOUBLE.typeName(), // floating points are not supported at all
            NumberFieldMapper.NumberType.FLOAT.typeName(),
            NumberFieldMapper.NumberType.HALF_FLOAT.typeName(),
@ -100,8 +105,6 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
        );
    }

-     */
-
    /**
     * For each provided field type, we also register an alias with name {@code <field>-alias}.
     */
--- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorTests.java
+++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorTests.java
@ -34,14 +34,17 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.analysis.AnalyzerScope;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.mapper.BinaryFieldMapper;
+import org.elasticsearch.index.mapper.GeoPointFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.AggregatorTestCase;
 import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
 import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregationBuilder;
-import org.elasticsearch.search.aggregations.bucket.terms.SignificantTerms;
-import org.elasticsearch.search.aggregations.bucket.terms.SignificantTextAggregationBuilder;
 import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper;
+import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.search.aggregations.support.ValuesSourceType;

 import java.io.IOException;
 import java.util.Arrays;
@ -65,6 +68,31 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
            Function.identity()));
    }

+    @Override
+    protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) {
+        return new SignificantTextAggregationBuilder("foo", fieldName);
+    }
+
+    @Override
+    protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
+        // TODO it is likely accidental that SigText supports anything other than Bytes, and then only text fields
+        return Arrays.asList(CoreValuesSourceType.NUMERIC,
+            CoreValuesSourceType.BYTES,
+            CoreValuesSourceType.RANGE,
+            CoreValuesSourceType.GEOPOINT,
+            CoreValuesSourceType.BOOLEAN,
+            CoreValuesSourceType.DATE,
+            CoreValuesSourceType.IP);
+    }
+
+    @Override
+    protected List<String> unsupportedMappedFieldTypes() {
+        return Arrays.asList(
+            BinaryFieldMapper.CONTENT_TYPE, // binary fields are not supported because they do not have analyzers
+            GeoPointFieldMapper.CONTENT_TYPE // geopoint fields cannot use term queries
+        );
+    }
+
    /**
     * Uses the significant text aggregation to find the keywords in text fields
     */