Expose CommonTermsQuery

Closes #2583
This commit is contained in:
Simon Willnauer 2013-01-24 13:25:28 +01:00
parent c4eab90b2e
commit 4eefcb9c82
6 changed files with 422 additions and 1 deletions

View File

@ -143,7 +143,7 @@ public class Queries {
private static Pattern spacePattern = Pattern.compile(" ");
private static Pattern lessThanPattern = Pattern.compile("<");
static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
public static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
int result = optionalClauseCount;
spec = spec.trim();

View File

@ -0,0 +1,169 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import java.io.IOException;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.XContentBuilder;
/**
* CommonTermsQuery query is a query that executes high-frequency terms in a
* optional sub-query to prevent slow queries due to "common" terms like
* stopwords. This query basically builds 2 queries off the {@link #add(Term)
* added} terms where low-frequency terms are added to a required boolean clause
* and high-frequency terms are added to an optional boolean clause. The
* optional clause is only executed if the required "low-frequency' clause
* matches. Scores produced by this query will be slightly different to plain
* {@link BooleanQuery} scorer mainly due to differences in the
* {@link Similarity#coord(int,int) number of leave queries} in the required
* boolean clause. In the most cases high-frequency terms are unlikely to
* significantly contribute to the document score unless at least one of the
* low-frequency terms are matched such that this query can improve query
* execution times significantly if applicable.
* <p>
*/
public class CommonTermsQueryBuilder extends BaseQueryBuilder implements BoostableQueryBuilder<CommonTermsQueryBuilder> {
public static enum Operator {
OR, AND
}
private final String name;
private final Object text;
private Operator highFreqOperator = null;
private Operator lowFreqOperator = null;
private String analyzer = null;
private Float boost = null;
private String minimumShouldMatch = null;
private Boolean disableCoords = null;
private Float cutoffFrequency = null;
/**
* Constructs a new common terms query.
*/
public CommonTermsQueryBuilder(String name, Object text) {
if (name == null) {
throw new ElasticSearchIllegalArgumentException("Field name must not be null");
}
if (text == null) {
throw new ElasticSearchIllegalArgumentException("Query must not be null");
}
this.text = text;
this.name = name;
}
/**
* Sets the operator to use for terms with a high document frequency
* (greater than or equal to {@link #cutoffFrequency(float)}. Defaults to
* <tt>AND</tt>.
*/
public CommonTermsQueryBuilder highFreqOperator(Operator operator) {
this.highFreqOperator = operator;
return this;
}
/**
* Sets the operator to use for terms with a low document frequency (less
* than {@link #cutoffFrequency(float)}. Defaults to <tt>AND</tt>.
*/
public CommonTermsQueryBuilder lowFreqOperator(Operator operator) {
this.lowFreqOperator = operator;
return this;
}
/**
* Explicitly set the analyzer to use. Defaults to use explicit mapping
* config for the field, or, if not set, the default search analyzer.
*/
public CommonTermsQueryBuilder analyzer(String analyzer) {
this.analyzer = analyzer;
return this;
}
/**
* Set the boost to apply to the query.
*/
public CommonTermsQueryBuilder boost(float boost) {
this.boost = boost;
return this;
}
/**
* Sets the cutoff document frequency for high / low frequent terms. A value
* in [0..1] (or absolute number >=1) representing the maximum threshold of
* a terms document frequency to be considered a low frequency term.
* Defaults to
* <tt>{@value CommonTermsQueryParser#DEFAULT_MAX_TERM_DOC_FREQ}</tt>
*/
public CommonTermsQueryBuilder cutoffFrequency(float cutoffFrequency) {
this.cutoffFrequency = cutoffFrequency;
return this;
}
/**
* Sets the minimum number of query terms that need to match in order to
* produce a hit.
*/
public CommonTermsQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch;
return this;
}
@Override
public void doXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(CommonTermsQueryParser.NAME);
builder.startObject(name);
builder.field("query", text);
if (disableCoords != null) {
builder.field("disable_coords", disableCoords);
}
if (highFreqOperator != null) {
builder.field("high_freq_operator", highFreqOperator.toString());
}
if (lowFreqOperator != null) {
builder.field("low_freq_operator", lowFreqOperator.toString());
}
if (analyzer != null) {
builder.field("analyzer", analyzer);
}
if (boost != null) {
builder.field("boost", boost);
}
if (cutoffFrequency != null) {
builder.field("cutoff_frequency", cutoffFrequency);
}
if (minimumShouldMatch != null) {
builder.field("minimum_should_match", minimumShouldMatch);
}
builder.endObject();
builder.endObject();
}
}

View File

@ -0,0 +1,203 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MapperService;
/**
*
*/
public class CommonTermsQueryParser implements QueryParser {
public static final String NAME = "common";
static final float DEFAULT_MAX_TERM_DOC_FREQ = 0.01f;
static final Occur DEFAULT_HIGH_FREQ_OCCUR = Occur.MUST;
static final Occur DEFAULT_LOW_FREQ_OCCUR = Occur.MUST;
static final boolean DEFAULT_DISABLE_COORDS = true;
@Inject
public CommonTermsQueryParser() {
}
@Override
public String[] names() {
return new String[] { NAME };
}
@Override
public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
XContentParser parser = parseContext.parser();
XContentParser.Token token = parser.nextToken();
if (token != XContentParser.Token.FIELD_NAME) {
throw new QueryParsingException(parseContext.index(), "[common] query malformed, no field");
}
String fieldName = parser.currentName();
Object value = null;
float boost = 1.0f;
String queryAnalyzer = null;
String minimumShouldMatch = null;
boolean disableCoords = DEFAULT_DISABLE_COORDS;
Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
Occur lowFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
float maxTermFrequency = DEFAULT_MAX_TERM_DOC_FREQ;
token = parser.nextToken();
if (token == XContentParser.Token.START_OBJECT) {
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (token.isValue()) {
if ("query".equals(currentFieldName)) {
value = parser.objectText();
} else if ("analyzer".equals(currentFieldName)) {
String analyzer = parser.text();
if (parseContext.analysisService().analyzer(analyzer) == null) {
throw new QueryParsingException(parseContext.index(), "[common] analyzer [" + parser.text() + "] not found");
}
queryAnalyzer = analyzer;
} else if ("disable_coords".equals(currentFieldName)) {
disableCoords = parser.booleanValue();
} else if ("boost".equals(currentFieldName)) {
boost = parser.floatValue();
} else if ("high_freq_operator".equals(currentFieldName)) {
String op = parser.text();
if ("or".equalsIgnoreCase(op)) {
highFreqOccur = BooleanClause.Occur.SHOULD;
} else if ("and".equalsIgnoreCase(op)) {
highFreqOccur = BooleanClause.Occur.MUST;
} else {
throw new QueryParsingException(parseContext.index(),
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
}
} else if ("low_freq_operator".equals(currentFieldName)) {
String op = parser.text();
if ("or".equalsIgnoreCase(op)) {
lowFreqOccur = BooleanClause.Occur.SHOULD;
} else if ("and".equalsIgnoreCase(op)) {
lowFreqOccur = BooleanClause.Occur.MUST;
} else {
throw new QueryParsingException(parseContext.index(),
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
}
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
minimumShouldMatch = parser.textOrNull();
} else if ("cutoff_frequency".equals(currentFieldName)) {
maxTermFrequency = parser.floatValue();
} else {
throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + currentFieldName + "]");
}
}
}
parser.nextToken();
} else {
value = parser.objectText();
// move to the next token
token = parser.nextToken();
if (token != XContentParser.Token.END_OBJECT) {
throw new QueryParsingException(
parseContext.index(),
"[common] query parsed in simplified form, with direct field name, but included more options than just the field name, possibly use its 'options' form, with 'query' element?");
}
}
if (value == null) {
throw new QueryParsingException(parseContext.index(), "No text specified for text query");
}
CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer);
if (numTerms == 0) {
return null;
}
if (minimumShouldMatch != null) {
query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch));
}
query.setBoost(boost);
return query;
}
private final int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
String queryAnalyzer) throws IOException {
FieldMapper<?> mapper = null;
String field;
MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
mapper = smartNameFieldMappers.mapper();
field = mapper.names().indexName();
} else {
field = fieldName;
}
Analyzer analyzer = null;
if (queryAnalyzer == null) {
if (mapper != null) {
analyzer = mapper.searchAnalyzer();
}
if (analyzer == null && smartNameFieldMappers != null) {
analyzer = smartNameFieldMappers.searchAnalyzer();
}
if (analyzer == null) {
analyzer = parseContext.mapperService().searchAnalyzer();
}
} else {
analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
if (analyzer == null) {
throw new ElasticSearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
}
}
// Logic similar to QueryParser#getFieldQuery
TokenStream source = analyzer.tokenStream(field, new FastStringReader(queryString.toString()));
source.reset();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
int count = 0;
while (source.incrementToken()) {
BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for
// UTF-8
UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
query.add(new Term(field, ref));
count++;
}
return count;
}
}

View File

@ -65,6 +65,16 @@ public abstract class QueryBuilders {
public static MatchQueryBuilder matchQuery(String name, Object text) {
return new MatchQueryBuilder(name, text).type(MatchQueryBuilder.Type.BOOLEAN);
}
/**
* Creates a common query for the provided field name and text.
*
* @param name The field name.
* @param text The query text (to be analyzed).
*/
public static CommonTermsQueryBuilder commonTerms(String name, Object text) {
return new CommonTermsQueryBuilder(name, text);
}
/**
* Creates a match query with type "BOOLEAN" for the provided field name and text.

View File

@ -101,6 +101,8 @@ public class IndicesQueriesModule extends AbstractModule {
qpBinders.addBinding().to(FuzzyLikeThisFieldQueryParser.class).asEagerSingleton();
qpBinders.addBinding().to(WrapperQueryParser.class).asEagerSingleton();
qpBinders.addBinding().to(IndicesQueryParser.class).asEagerSingleton();
qpBinders.addBinding().to(CommonTermsQueryParser.class).asEagerSingleton();
if (ShapesAvailability.JTS_AVAILABLE) {
qpBinders.addBinding().to(GeoShapeQueryParser.class).asEagerSingleton();
}

View File

@ -24,6 +24,7 @@ import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.query.*;
import org.elasticsearch.index.query.CommonTermsQueryBuilder.Operator;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.test.integration.AbstractNodesTests;
@ -104,7 +105,43 @@ public class SimpleQueryTests extends AbstractNodesTests {
assertTrue(e.getMessage().endsWith("IllegalStateException[field \"field1\" was indexed without position data; cannot run PhraseQuery (term=quick)]; }"));
}
}
@Test
public void testCommonTermsQuery() throws Exception {
try {
client.admin().indices().prepareDelete("test").execute().actionGet();
} catch (Exception e) {
// ignore
}
client.admin().indices().prepareCreate("test")
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field1").field("analyzer", "whitespace").field("type", "string").endObject().endObject().endObject().endObject())
.setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 1)).execute().actionGet();
client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet();
client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet();
client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet();
SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet();
assertThat(searchResponse.hits().totalHits(), equalTo(2l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).lowFreqOperator(Operator.OR)).execute().actionGet();
assertThat(searchResponse.hits().totalHits(), equalTo(3l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
assertThat(searchResponse.hits().totalHits(), equalTo(3l));
// standard drops "the" since its a stopword
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
}
@Test
public void testOmitTermFreqsAndPositions() throws Exception {
// backwards compat test!