refactor: unify terms and significant_terms parsing

Both the terms and significant_terms aggregations need requiredSize, shardSize, minDocCount and shardMinDocCount; their parsing should not be duplicated.
This commit is contained in:
Britta Weber 2014-05-12 23:39:24 +02:00
parent e041b5992c
commit 8e3bcb5e2f
6 changed files with 287 additions and 148 deletions

View File

@ -22,6 +22,7 @@ package org.elasticsearch.search.aggregations.bucket.significant;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.AbstractTermsParametersParser;
import java.io.IOException;
@ -34,10 +35,10 @@ import java.io.IOException;
public class SignificantTermsBuilder extends AggregationBuilder<SignificantTermsBuilder> {
private String field;
private int requiredSize = SignificantTermsParser.DEFAULT_REQUIRED_SIZE;
private int shardSize = SignificantTermsParser.DEFAULT_SHARD_SIZE;
private int minDocCount = SignificantTermsParser.DEFAULT_MIN_DOC_COUNT;
private int shardMinDocCount = SignificantTermsParser.DEFAULT_SHARD_MIN_DOC_COUNT;
private int requiredSize = AbstractTermsParametersParser.DEFAULT_REQUIRED_SIZE;
private int shardSize = AbstractTermsParametersParser.DEFAULT_SHARD_SIZE;
private int minDocCount = AbstractTermsParametersParser.DEFAULT_MIN_DOC_COUNT;
private int shardMinDocCount = AbstractTermsParametersParser.DEFAULT_SHARD_MIN_DOC_COUNT;
private String executionHint;
private String includePattern;
private int includeFlags;
@ -136,16 +137,16 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
if (field != null) {
builder.field("field", field);
}
if (minDocCount != SignificantTermsParser.DEFAULT_MIN_DOC_COUNT) {
if (minDocCount != AbstractTermsParametersParser.DEFAULT_MIN_DOC_COUNT) {
builder.field("minDocCount", minDocCount);
}
if (shardMinDocCount != SignificantTermsParser.DEFAULT_SHARD_MIN_DOC_COUNT) {
if (shardMinDocCount != AbstractTermsParametersParser.DEFAULT_SHARD_MIN_DOC_COUNT) {
builder.field("shardMinDocCount", shardMinDocCount);
}
if (requiredSize != SignificantTermsParser.DEFAULT_REQUIRED_SIZE) {
if (requiredSize != AbstractTermsParametersParser.DEFAULT_REQUIRED_SIZE) {
builder.field("size", requiredSize);
}
if (shardSize != SignificantTermsParser.DEFAULT_SHARD_SIZE) {
if (shardSize != AbstractTermsParametersParser.DEFAULT_SHARD_SIZE) {
builder.field("shard_size", shardSize);
}
if (executionHint != null) {
@ -173,7 +174,7 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
}
if (filterBuilder != null) {
builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
builder.field(SignificantTermsParametersParser.BACKGROUND_FILTER.getPreferredName());
filterBuilder.toXContent(builder, params);
}

View File

@ -0,0 +1,62 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.bucket.terms.AbstractTermsParametersParser;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
public class SignificantTermsParametersParser extends AbstractTermsParametersParser {

    static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");

    // Parsed "background_filter" clause, or null if the request did not supply one.
    private Filter filter = null;

    /** Returns the parsed background filter, or {@code null} when none was given. */
    public Filter getFilter() {
        return filter;
    }

    @Override
    public void setDefaults() {
        // Significance typically needs more than one occurrence of a term
        // to be statistically meaningful, hence the higher default.
        setMinDocCount(3);
    }

    /**
     * Handles the keys that are specific to significant_terms: currently only
     * the "background_filter" object. Any other key is rejected.
     */
    @Override
    public void parseSpecial(String aggregationName, XContentParser parser, SearchContext context, XContentParser.Token token, String currentFieldName) throws IOException {
        boolean isBackgroundFilter = token == XContentParser.Token.START_OBJECT && BACKGROUND_FILTER.match(currentFieldName);
        if (isBackgroundFilter) {
            filter = context.queryParserService().parseInnerFilter(parser).filter();
        } else {
            // Same message for a wrong token type and an unknown object key.
            throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
        }
    }
}

View File

@ -18,10 +18,7 @@
*/
package org.elasticsearch.search.aggregations.bucket.significant;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.bucket.BucketUtils;
@ -36,16 +33,7 @@ import java.io.IOException;
*/
public class SignificantTermsParser implements Aggregator.Parser {
public static final int DEFAULT_REQUIRED_SIZE = 10;
public static final int DEFAULT_SHARD_SIZE = 0;
//Typically need more than one occurrence of something for it to be statistically significant
public static final int DEFAULT_MIN_DOC_COUNT = 3;
static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");
static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");
public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;
@Override
public String type() {
@ -54,64 +42,17 @@ public class SignificantTermsParser implements Aggregator.Parser {
@Override
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
SignificantTermsParametersParser aggParser = new SignificantTermsParametersParser();
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, SignificantStringTerms.TYPE, context)
.scriptable(false)
.formattable(true)
.requiresSortedValues(true)
.requiresUniqueValues(true)
.build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context);
Filter filter = null;
int requiredSize = DEFAULT_REQUIRED_SIZE;
int shardSize = DEFAULT_SHARD_SIZE;
long minDocCount = DEFAULT_MIN_DOC_COUNT;
long shardMinDocCount = DEFAULT_SHARD_MIN_DOC_COUNT;
String executionHint = null;
XContentParser.Token token;
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (vsParser.token(currentFieldName, token, parser)) {
continue;
} else if (incExcParser.token(currentFieldName, token, parser)) {
continue;
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("execution_hint".equals(currentFieldName) || "executionHint".equals(currentFieldName)) {
executionHint = parser.text();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("size".equals(currentFieldName)) {
requiredSize = parser.intValue();
} else if ("shard_size".equals(currentFieldName) || "shardSize".equals(currentFieldName)) {
shardSize = parser.intValue();
} else if ("min_doc_count".equals(currentFieldName) || "minDocCount".equals(currentFieldName)) {
minDocCount = parser.intValue();
} else if (SHARD_MIN_DOC_COUNT_FIELD_NAME.match(currentFieldName)){
shardMinDocCount = parser.longValue();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else if (token == XContentParser.Token.START_OBJECT) {
if (BACKGROUND_FILTER.match(currentFieldName)) {
filter = context.queryParserService().parseInnerFilter(parser).filter();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else {
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
}
}
if (shardSize == DEFAULT_SHARD_SIZE) {
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
int shardSize = aggParser.getShardSize();
if ( shardSize == aggParser.DEFAULT_SHARD_SIZE) {
//The user has not made a shardSize selection .
//Use default heuristic to avoid any wrong-ranking caused by distributed counting
//but request double the usual amount.
@ -119,22 +60,20 @@ public class SignificantTermsParser implements Aggregator.Parser {
//as the significance algorithm is in less of a position to down-select at shard-level -
//some of the things we want to find have only one occurrence on each shard and as
// such are impossible to differentiate from non-significant terms at that early stage.
shardSize = 2 * BucketUtils.suggestShardSideQueueSize(requiredSize, context.numberOfShards());
shardSize = 2 * BucketUtils.suggestShardSideQueueSize(aggParser.getRequiredSize(), context.numberOfShards());
}
// shard_size cannot be smaller than size as we need to at least fetch <size> entries from every shards in order to return <size>
if (shardSize < requiredSize) {
shardSize = requiredSize;
if (shardSize < aggParser.getRequiredSize()) {
shardSize = aggParser.getRequiredSize();
}
long shardMinDocCount = aggParser.getShardMinDocCount();
// shard_min_doc_count should not be larger than min_doc_count because this can cause buckets to be removed that would match the min_doc_count criteria
if (shardMinDocCount > minDocCount) {
shardMinDocCount = minDocCount;
if (shardMinDocCount > aggParser.getMinDocCount()) {
shardMinDocCount = aggParser.getMinDocCount();
}
IncludeExclude includeExclude = incExcParser.includeExclude();
return new SignificantTermsAggregatorFactory(aggregationName, vsParser.config(), requiredSize, shardSize, minDocCount, shardMinDocCount, includeExclude, executionHint, filter);
return new SignificantTermsAggregatorFactory(aggregationName, vsParser.config(), aggParser.getRequiredSize(), shardSize, aggParser.getMinDocCount(), shardMinDocCount, aggParser.getIncludeExclude(), aggParser.getExecutionHint(), aggParser.getFilter());
}
}

View File

@ -0,0 +1,121 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.terms;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.ValuesSourceParser;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
/**
 * Shared request-parsing logic for the terms and significant_terms aggregations.
 * Both need requiredSize, shardSize, minDocCount and shardMinDocCount; keys that
 * only one of them understands are delegated to {@link #parseSpecial}.
 */
public abstract class AbstractTermsParametersParser {

    public static final int DEFAULT_REQUIRED_SIZE = 10;
    public static final int DEFAULT_SHARD_SIZE = -1;

    //Typically need more than one occurrence of something for it to be statistically significant
    public static final int DEFAULT_MIN_DOC_COUNT = 1;
    public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;

    static final ParseField EXECUTION_HINT_FIELD_NAME = new ParseField("execution_hint");
    static final ParseField SHARD_SIZE_FIELD_NAME = new ParseField("shard_size");
    static final ParseField MIN_DOC_COUNT_FIELD_NAME = new ParseField("min_doc_count");
    static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");

    //These are the results of the parsing.
    private int requiredSize = DEFAULT_REQUIRED_SIZE;
    private int shardSize = DEFAULT_SHARD_SIZE;
    private long minDocCount = DEFAULT_MIN_DOC_COUNT;
    private long shardMinDocCount = DEFAULT_SHARD_MIN_DOC_COUNT;
    private String executionHint = null;
    IncludeExclude includeExclude;

    /** Number of buckets the user asked for ("size"). */
    public int getRequiredSize() {
        return requiredSize;
    }

    /** Per-shard bucket count ("shard_size"); {@link #DEFAULT_SHARD_SIZE} means "not set". */
    public int getShardSize() {
        return shardSize;
    }

    /** Lets subclasses install a different default in {@link #setDefaults()}. */
    public void setMinDocCount(long minDocCount) {
        this.minDocCount = minDocCount;
    }

    public long getMinDocCount() {
        return minDocCount;
    }

    public long getShardMinDocCount() {
        return shardMinDocCount;
    }

    public String getExecutionHint() {
        return executionHint;
    }

    public IncludeExclude getIncludeExclude() {
        return includeExclude;
    }

    /**
     * Consumes the aggregation's parameter object from {@code parser}.
     * Common keys are handled here; values-source and include/exclude tokens are
     * offered to {@code vsParser} and {@code incExcParser} first; anything else
     * is passed to {@link #parseSpecial}, which throws on truly unknown keys.
     */
    public void parse(String aggregationName, XContentParser parser, SearchContext context, ValuesSourceParser vsParser, IncludeExclude.Parser incExcParser) throws IOException {
        XContentParser.Token token;
        String currentFieldName = null;
        setDefaults();
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
            } else if (vsParser.token(currentFieldName, token, parser)) {
                continue;
            } else if (incExcParser.token(currentFieldName, token, parser)) {
                continue;
            } else if (token == XContentParser.Token.VALUE_STRING) {
                if (EXECUTION_HINT_FIELD_NAME.match(currentFieldName)) {
                    executionHint = parser.text();
                } else {
                    parseSpecial(aggregationName, parser, context, token, currentFieldName);
                }
            } else if (token == XContentParser.Token.VALUE_NUMBER) {
                if ("size".equals(currentFieldName)) {
                    requiredSize = parser.intValue();
                } else if (SHARD_SIZE_FIELD_NAME.match(currentFieldName)) {
                    shardSize = parser.intValue();
                } else if (MIN_DOC_COUNT_FIELD_NAME.match(currentFieldName)) {
                    // minDocCount is a long: read the full value instead of truncating
                    // through intValue(), consistent with shard_min_doc_count below.
                    minDocCount = parser.longValue();
                } else if (SHARD_MIN_DOC_COUNT_FIELD_NAME.match(currentFieldName)) {
                    shardMinDocCount = parser.longValue();
                } else {
                    parseSpecial(aggregationName, parser, context, token, currentFieldName);
                }
            } else {
                parseSpecial(aggregationName, parser, context, token, currentFieldName);
            }
        }
        includeExclude = incExcParser.includeExclude();
    }

    /** Hook for keys only one concrete parser understands; must throw for unknown keys. */
    public abstract void parseSpecial(String aggregationName, XContentParser parser, SearchContext context, XContentParser.Token token, String currentFieldName) throws IOException;

    /** Hook to adjust the shared defaults (e.g. significant_terms raises min_doc_count). */
    public abstract void setDefaults();
}

View File

@ -0,0 +1,75 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.terms;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
public class TermsParametersParser extends AbstractTermsParametersParser {

    // Defaults: order buckets by document count, descending.
    String orderKey = "_count";
    boolean orderAsc = false;

    /** Key of the requested bucket order (e.g. "_count" or "_term"). */
    public String getOrderKey() {
        return orderKey;
    }

    /** True when the requested order direction is ascending. */
    public boolean isOrderAsc() {
        return orderAsc;
    }

    @Override
    public void setDefaults() {
        // The plain terms aggregation keeps the shared defaults unchanged.
    }

    /**
     * Handles the terms-specific "order" object: {"&lt;key&gt;": "asc"|"desc"}.
     * Every other key is rejected.
     */
    @Override
    public void parseSpecial(String aggregationName, XContentParser parser, SearchContext context, XContentParser.Token token, String currentFieldName) throws IOException {
        if (token != XContentParser.Token.START_OBJECT || !"order".equals(currentFieldName)) {
            // Same message for a wrong token type and an unknown object key.
            throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
        }
        for (token = parser.nextToken(); token != XContentParser.Token.END_OBJECT; token = parser.nextToken()) {
            if (token == XContentParser.Token.FIELD_NAME) {
                orderKey = parser.currentName();
            } else if (token == XContentParser.Token.VALUE_STRING) {
                String direction = parser.text();
                if ("asc".equalsIgnoreCase(direction)) {
                    orderAsc = true;
                } else if ("desc".equalsIgnoreCase(direction)) {
                    orderAsc = false;
                } else {
                    throw new SearchParseException(context, "Unknown terms order direction [" + direction + "] in terms aggregation [" + aggregationName + "]");
                }
            } else {
                throw new SearchParseException(context, "Unexpected token " + token + " for [order] in [" + aggregationName + "].");
            }
        }
    }
}

View File

@ -19,7 +19,6 @@
package org.elasticsearch.search.aggregations.bucket.terms;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
@ -40,78 +39,22 @@ public class TermsParser implements Aggregator.Parser {
@Override
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
int requiredSize = 10;
int shardSize = -1;
String orderKey = "_count";
boolean orderAsc = false;
String executionHint = null;
long minDocCount = 1;
TermsParametersParser aggParser = new TermsParametersParser();
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context)
.scriptable(true)
.formattable(true)
.requiresSortedValues(true)
.requiresUniqueValues(true)
.formattable(true)
.build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context);
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
XContentParser.Token token;
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (vsParser.token(currentFieldName, token, parser)) {
continue;
} else if (incExcParser.token(currentFieldName, token, parser)) {
continue;
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("execution_hint".equals(currentFieldName) || "executionHint".equals(currentFieldName)) {
executionHint = parser.text();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("size".equals(currentFieldName)) {
requiredSize = parser.intValue();
} else if ("shard_size".equals(currentFieldName) || "shardSize".equals(currentFieldName)) {
shardSize = parser.intValue();
} else if ("min_doc_count".equals(currentFieldName) || "minDocCount".equals(currentFieldName)) {
minDocCount = parser.intValue();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else if (token == XContentParser.Token.START_OBJECT) {
if ("order".equals(currentFieldName)) {
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
orderKey = parser.currentName();
} else if (token == XContentParser.Token.VALUE_STRING) {
String dir = parser.text();
if ("asc".equalsIgnoreCase(dir)) {
orderAsc = true;
} else if ("desc".equalsIgnoreCase(dir)) {
orderAsc = false;
} else {
throw new SearchParseException(context, "Unknown terms order direction [" + dir + "] in terms aggregation [" + aggregationName + "]");
}
} else {
throw new SearchParseException(context, "Unexpected token " + token + " for [order] in [" + aggregationName + "].");
}
}
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else {
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
}
}
int shardSize = aggParser.getShardSize();
if (shardSize == 0) {
shardSize = Integer.MAX_VALUE;
}
int requiredSize = aggParser.getRequiredSize();
if (requiredSize == 0) {
requiredSize = Integer.MAX_VALUE;
}
@ -120,10 +63,8 @@ public class TermsParser implements Aggregator.Parser {
if (shardSize < requiredSize) {
shardSize = requiredSize;
}
IncludeExclude includeExclude = incExcParser.includeExclude();
InternalOrder order = resolveOrder(orderKey, orderAsc);
return new TermsAggregatorFactory(aggregationName, vsParser.config(), order, requiredSize, shardSize, minDocCount, includeExclude, executionHint);
InternalOrder order = resolveOrder(aggParser.getOrderKey(), aggParser.isOrderAsc());
return new TermsAggregatorFactory(aggregationName, vsParser.config(), order, requiredSize, shardSize, aggParser.getMinDocCount(), aggParser.getIncludeExclude(), aggParser.getExecutionHint());
}
static InternalOrder resolveOrder(String key, boolean asc) {