[7.x Backport] Prevent SigTerms/SigText from running on fields they do not support (#57485)

SigTerms cannot run on fields that are not searchable, and SigText
cannot run on fields that do not have analyzers.  Both of these
situations fail today with an esoteric exception, so this just formalizes
the constraint by throwing an IllegalArgumentException up front.

In practice, the only affected field seems to be the `binary` field,
which is neither searchable or has a default analyzer (e.g. even numeric
and keyword fields have a default analyzer despite not being tokenized)

Adds supported-type tests, and makes some changes to the test itself
to allow testing sigtext (indexing _source).

Also a few tweaks to the test to avoid bad randomization (negative
numbers, etc).
This commit is contained in:
Zachary Tong 2020-06-02 16:03:37 -04:00 committed by GitHub
parent f360020048
commit 79ac69cfa3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 56 additions and 8 deletions

View File

@ -205,7 +205,12 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
}
}
if (!config.unmapped()) {
if (config.unmapped() == false) {
if (config.fieldContext().fieldType().isSearchable() == false) {
throw new IllegalArgumentException("SignificantText aggregation requires fields to be searchable, but ["
+ config.fieldContext().fieldType().name() + "] is not");
}
this.fieldType = config.fieldContext().fieldType();
this.indexedFieldName = fieldType.name();
}
@ -248,6 +253,10 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
}
private long getBackgroundFrequency(String value) throws IOException {
// fieldType can be null if the field is unmapped, but theoretically this method should only be called
// when constructing buckets. Assert to ensure this is the case
// TODO this is a bad setup and it should be refactored
assert fieldType != null;
Query query = fieldType.termQuery(value, queryShardContext);
if (query instanceof TermQuery) {
// for types that use the inverted index, we prefer using a caching terms

View File

@ -79,6 +79,10 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory
// Note that if the field is unmapped (its field type is null), we don't fail,
// and just use the given field name as a placeholder.
this.fieldType = queryShardContext.fieldMapper(fieldName);
if (fieldType != null && fieldType.indexAnalyzer() == null) {
throw new IllegalArgumentException("Field [" + fieldType.name() + "] has no analyzer, but SignificantText " +
"requires an analyzed field");
}
this.indexedFieldName = fieldType != null ? fieldType.name() : fieldName;
this.sourceFieldNames = sourceFieldNames == null
? new String[] { indexedFieldName }
@ -120,6 +124,10 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory
}
private long getBackgroundFrequency(String value) throws IOException {
// fieldType can be null if the field is unmapped, but theoretically this method should only be called
// when constructing buckets. Assert to ensure this is the case
// TODO this is a bad setup and it should be refactored
assert fieldType != null;
Query query = fieldType.termQuery(value, queryShardContext);
if (query instanceof TermQuery) {
// for types that use the inverted index, we prefer using a caching terms

View File

@ -39,6 +39,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.BinaryFieldMapper;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.NumberFieldMapper;
@ -52,6 +53,8 @@ import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregatorTestCase;
import org.elasticsearch.search.aggregations.bucket.terms.SignificantTermsAggregatorFactory.ExecutionMode;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import org.junit.Before;
import java.io.IOException;
@ -83,16 +86,18 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
return new SignificantTermsAggregationBuilder("foo").field(fieldName);
}
/* NOTE - commented out instead of deleted, we need to backport https://github.com/elastic/elasticsearch/pull/52851 to support this.
@Override
protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
return Arrays.asList(CoreValuesSourceType.NUMERIC,
CoreValuesSourceType.BYTES);
CoreValuesSourceType.BYTES,
CoreValuesSourceType.BOOLEAN,
CoreValuesSourceType.DATE,
CoreValuesSourceType.IP);
}
@Override
protected List<String> unsupportedMappedFieldTypes() {
return List.of(
return Arrays.asList(
NumberFieldMapper.NumberType.DOUBLE.typeName(), // floating points are not supported at all
NumberFieldMapper.NumberType.FLOAT.typeName(),
NumberFieldMapper.NumberType.HALF_FLOAT.typeName(),
@ -100,8 +105,6 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
);
}
*/
/**
* For each provided field type, we also register an alias with name {@code <field>-alias}.
*/

View File

@ -34,14 +34,17 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.BinaryFieldMapper;
import org.elasticsearch.index.mapper.GeoPointFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregatorTestCase;
import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.SignificantTerms;
import org.elasticsearch.search.aggregations.bucket.terms.SignificantTextAggregationBuilder;
import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import java.io.IOException;
import java.util.Arrays;
@ -65,6 +68,31 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
Function.identity()));
}
@Override
protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) {
return new SignificantTextAggregationBuilder("foo", fieldName);
}
@Override
protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
// TODO it is likely accidental that SigText supports anything other than Bytes, and then only text fields
return Arrays.asList(CoreValuesSourceType.NUMERIC,
CoreValuesSourceType.BYTES,
CoreValuesSourceType.RANGE,
CoreValuesSourceType.GEOPOINT,
CoreValuesSourceType.BOOLEAN,
CoreValuesSourceType.DATE,
CoreValuesSourceType.IP);
}
@Override
protected List<String> unsupportedMappedFieldTypes() {
return Arrays.asList(
BinaryFieldMapper.CONTENT_TYPE, // binary fields are not supported because they do not have analyzers
GeoPointFieldMapper.CONTENT_TYPE // geopoint fields cannot use term queries
);
}
/**
* Uses the significant text aggregation to find the keywords in text fields
*/