From 4eefcb9c82778e8cd7a0126b98872cf08740c46e Mon Sep 17 00:00:00 2001
From: Simon Willnauer
Date: Thu, 24 Jan 2013 13:25:28 +0100
Subject: [PATCH] Expose CommonTermsQuery

Closes #2583
---
 .../common/lucene/search/Queries.java         |   2 +-
 .../index/query/CommonTermsQueryBuilder.java  | 177 +++++++++++++++
 .../index/query/CommonTermsQueryParser.java   | 220 ++++++++++++++++++
 .../index/query/QueryBuilders.java            |  10 +
 .../indices/query/IndicesQueriesModule.java   |   2 +
 .../search/query/SimpleQueryTests.java        |  37 ++++
 6 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 src/main/java/org/elasticsearch/index/query/CommonTermsQueryBuilder.java
 create mode 100644 src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java

diff --git a/src/main/java/org/elasticsearch/common/lucene/search/Queries.java b/src/main/java/org/elasticsearch/common/lucene/search/Queries.java
index b79ac6710ed..81a4b9d8480 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/Queries.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/Queries.java
@@ -143,7 +143,7 @@ public class Queries {
     private static Pattern spacePattern = Pattern.compile(" ");
     private static Pattern lessThanPattern = Pattern.compile("<");
 
-    static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
+    public static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
         int result = optionalClauseCount;
         spec = spec.trim();
 
diff --git a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryBuilder.java
new file mode 100644
index 00000000000..b972683726b
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryBuilder.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.query;
+
+import java.io.IOException;
+
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+/**
+ * CommonTermsQuery query is a query that executes high-frequency terms in an
+ * optional sub-query to prevent slow queries due to "common" terms like
+ * stopwords. This query basically builds 2 queries off the analyzed query
+ * text where low-frequency terms are added to a required boolean clause
+ * and high-frequency terms are added to an optional boolean clause. The
+ * optional clause is only executed if the required "low-frequency" clause
+ * matches. Scores produced by this query will be slightly different to plain
+ * {@code BooleanQuery} scorer mainly due to differences in the
+ * {@code Similarity#coord(int,int)} number of leaf queries in the required
+ * boolean clause. In most cases high-frequency terms are unlikely to
+ * significantly contribute to the document score unless at least one of the
+ * low-frequency terms are matched such that this query can improve query
+ * execution times significantly if applicable.
+ */
+public class CommonTermsQueryBuilder extends BaseQueryBuilder implements BoostableQueryBuilder {
+
+    public static enum Operator {
+        OR, AND
+    }
+
+    private final String name;
+
+    private final Object text;
+
+    private Operator highFreqOperator = null;
+
+    private Operator lowFreqOperator = null;
+
+    private String analyzer = null;
+
+    private Float boost = null;
+
+    private String minimumShouldMatch = null;
+
+    private Boolean disableCoords = null;
+
+    private Float cutoffFrequency = null;
+
+    /**
+     * Constructs a new common terms query.
+     */
+    public CommonTermsQueryBuilder(String name, Object text) {
+        if (name == null) {
+            throw new ElasticSearchIllegalArgumentException("Field name must not be null");
+        }
+        if (text == null) {
+            throw new ElasticSearchIllegalArgumentException("Query must not be null");
+        }
+        this.text = text;
+        this.name = name;
+    }
+
+    /**
+     * Sets the operator to use for terms with a high document frequency
+     * (greater than or equal to {@link #cutoffFrequency(float)}). Defaults to
+     * AND.
+     */
+    public CommonTermsQueryBuilder highFreqOperator(Operator operator) {
+        this.highFreqOperator = operator;
+        return this;
+    }
+
+    /**
+     * Sets the operator to use for terms with a low document frequency (less
+     * than {@link #cutoffFrequency(float)}). Defaults to AND.
+     */
+    public CommonTermsQueryBuilder lowFreqOperator(Operator operator) {
+        this.lowFreqOperator = operator;
+        return this;
+    }
+
+    /**
+     * Explicitly set the analyzer to use. Defaults to use explicit mapping
+     * config for the field, or, if not set, the default search analyzer.
+     */
+    public CommonTermsQueryBuilder analyzer(String analyzer) {
+        this.analyzer = analyzer;
+        return this;
+    }
+
+    /**
+     * Set the boost to apply to the query.
+     */
+    public CommonTermsQueryBuilder boost(float boost) {
+        this.boost = boost;
+        return this;
+    }
+
+    /**
+     * Sets the cutoff document frequency for high / low frequent terms. A value
+     * in [0..1] (or absolute number >=1) representing the maximum threshold of
+     * a terms document frequency to be considered a low frequency term.
+     * Defaults to
+     * {@value CommonTermsQueryParser#DEFAULT_MAX_TERM_DOC_FREQ}
+     */
+    public CommonTermsQueryBuilder cutoffFrequency(float cutoffFrequency) {
+        this.cutoffFrequency = cutoffFrequency;
+        return this;
+    }
+
+    /**
+     * Sets the minimum number of query terms that need to match in order to
+     * produce a hit.
+     */
+    public CommonTermsQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
+        this.minimumShouldMatch = minimumShouldMatch;
+        return this;
+    }
+
+    /**
+     * Sets whether the coord factor is disabled when scoring the query.
+     * Defaults to {@value CommonTermsQueryParser#DEFAULT_DISABLE_COORDS}.
+     */
+    public CommonTermsQueryBuilder disableCoords(boolean disableCoords) {
+        this.disableCoords = disableCoords;
+        return this;
+    }
+
+    @Override
+    public void doXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject(CommonTermsQueryParser.NAME);
+        builder.startObject(name);
+
+        builder.field("query", text);
+        if (disableCoords != null) {
+            builder.field("disable_coords", disableCoords);
+        }
+        if (highFreqOperator != null) {
+            builder.field("high_freq_operator", highFreqOperator.toString());
+        }
+        if (lowFreqOperator != null) {
+            builder.field("low_freq_operator", lowFreqOperator.toString());
+        }
+        if (analyzer != null) {
+            builder.field("analyzer", analyzer);
+        }
+        if (boost != null) {
+            builder.field("boost", boost);
+        }
+        if (cutoffFrequency != null) {
+            builder.field("cutoff_frequency", cutoffFrequency);
+        }
+        if (minimumShouldMatch != null) {
+            builder.field("minimum_should_match", minimumShouldMatch);
+        }
+
+        builder.endObject();
+        builder.endObject();
+    }
+}
diff --git a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
new file mode 100644
index 00000000000..efc1ffa1231
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.query;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.io.FastStringReader;
+import org.elasticsearch.common.lucene.search.Queries;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.MapperService;
+
+/**
+ * Parser for the {@code common} query: analyzes the query text for a field
+ * and builds a Lucene {@link CommonTermsQuery} from the resulting terms.
+ */
+public class CommonTermsQueryParser implements QueryParser {
+
+    public static final String NAME = "common";
+
+    static final float DEFAULT_MAX_TERM_DOC_FREQ = 0.01f;
+
+    static final Occur DEFAULT_HIGH_FREQ_OCCUR = Occur.MUST;
+
+    static final Occur DEFAULT_LOW_FREQ_OCCUR = Occur.MUST;
+
+    static final boolean DEFAULT_DISABLE_COORDS = true;
+
+    @Inject
+    public CommonTermsQueryParser() {
+    }
+
+    @Override
+    public String[] names() {
+        return new String[] { NAME };
+    }
+
+    /**
+     * Parses the query in its short form (field name mapped directly to the
+     * query text) or in its object form (field name mapped to an object
+     * holding {@code query} plus optional settings).
+     *
+     * @return the query, or {@code null} if analysis produced no terms
+     */
+    @Override
+    public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
+        XContentParser parser = parseContext.parser();
+        XContentParser.Token token = parser.nextToken();
+        if (token != XContentParser.Token.FIELD_NAME) {
+            throw new QueryParsingException(parseContext.index(), "[common] query malformed, no field");
+        }
+        String fieldName = parser.currentName();
+        Object value = null;
+        float boost = 1.0f;
+        String queryAnalyzer = null;
+        String minimumShouldMatch = null;
+        boolean disableCoords = DEFAULT_DISABLE_COORDS;
+        Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
+        Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR;
+        float maxTermFrequency = DEFAULT_MAX_TERM_DOC_FREQ;
+        token = parser.nextToken();
+        if (token == XContentParser.Token.START_OBJECT) {
+            String currentFieldName = null;
+            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
+                if (token == XContentParser.Token.FIELD_NAME) {
+                    currentFieldName = parser.currentName();
+                } else if (token.isValue()) {
+                    if ("query".equals(currentFieldName)) {
+                        value = parser.objectText();
+                    } else if ("analyzer".equals(currentFieldName)) {
+                        String analyzer = parser.text();
+                        if (parseContext.analysisService().analyzer(analyzer) == null) {
+                            throw new QueryParsingException(parseContext.index(), "[common] analyzer [" + analyzer + "] not found");
+                        }
+                        queryAnalyzer = analyzer;
+                    } else if ("disable_coords".equals(currentFieldName)) {
+                        disableCoords = parser.booleanValue();
+                    } else if ("boost".equals(currentFieldName)) {
+                        boost = parser.floatValue();
+                    } else if ("high_freq_operator".equals(currentFieldName)) {
+                        String op = parser.text();
+                        if ("or".equalsIgnoreCase(op)) {
+                            highFreqOccur = BooleanClause.Occur.SHOULD;
+                        } else if ("and".equalsIgnoreCase(op)) {
+                            highFreqOccur = BooleanClause.Occur.MUST;
+                        } else {
+                            throw new QueryParsingException(parseContext.index(),
+                                    "[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
+                        }
+                    } else if ("low_freq_operator".equals(currentFieldName)) {
+                        String op = parser.text();
+                        if ("or".equalsIgnoreCase(op)) {
+                            lowFreqOccur = BooleanClause.Occur.SHOULD;
+                        } else if ("and".equalsIgnoreCase(op)) {
+                            lowFreqOccur = BooleanClause.Occur.MUST;
+                        } else {
+                            throw new QueryParsingException(parseContext.index(),
+                                    "[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
+                        }
+                    } else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
+                        minimumShouldMatch = parser.textOrNull();
+                    } else if ("cutoff_frequency".equals(currentFieldName)) {
+                        maxTermFrequency = parser.floatValue();
+                    } else {
+                        throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + currentFieldName + "]");
+                    }
+                }
+            }
+            parser.nextToken();
+        } else {
+            value = parser.objectText();
+            // move to the next token
+            token = parser.nextToken();
+            if (token != XContentParser.Token.END_OBJECT) {
+                throw new QueryParsingException(
+                        parseContext.index(),
+                        "[common] query parsed in simplified form, with direct field name, but included more options than just the field name, possibly use its 'options' form, with 'query' element?");
+            }
+        }
+
+        if (value == null) {
+            throw new QueryParsingException(parseContext.index(), "[common] no text specified for query");
+        }
+        CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
+        int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer);
+        if (numTerms == 0) {
+            return null;
+        }
+        if (minimumShouldMatch != null) {
+            query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch));
+        }
+        query.setBoost(boost);
+        return query;
+    }
+
+    /**
+     * Analyzes {@code queryString} and adds every emitted token to
+     * {@code query} as a term of the resolved index field.
+     *
+     * @return the number of terms added to the query
+     */
+    private int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
+            String queryAnalyzer) throws IOException {
+        FieldMapper mapper = null;
+        String field;
+        MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
+        if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
+            mapper = smartNameFieldMappers.mapper();
+            field = mapper.names().indexName();
+        } else {
+            field = fieldName;
+        }
+
+        Analyzer analyzer = null;
+        if (queryAnalyzer == null) {
+            if (mapper != null) {
+                analyzer = mapper.searchAnalyzer();
+            }
+            if (analyzer == null && smartNameFieldMappers != null) {
+                analyzer = smartNameFieldMappers.searchAnalyzer();
+            }
+            if (analyzer == null) {
+                analyzer = parseContext.mapperService().searchAnalyzer();
+            }
+        } else {
+            analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
+            if (analyzer == null) {
+                throw new ElasticSearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
+            }
+        }
+
+        // Logic similar to QueryParser#getFieldQuery
+        TokenStream source = analyzer.tokenStream(field, new FastStringReader(queryString));
+        int count = 0;
+        try {
+            source.reset();
+            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+            while (source.incrementToken()) {
+                BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for UTF-8
+                UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
+                query.add(new Term(field, ref));
+                count++;
+            }
+            source.end();
+        } finally {
+            // always release the stream, even if analysis fails part way
+            source.close();
+        }
+        return count;
+    }
+}
diff --git a/src/main/java/org/elasticsearch/index/query/QueryBuilders.java b/src/main/java/org/elasticsearch/index/query/QueryBuilders.java
index a2397dbb219..fc259f0c612 100644
--- a/src/main/java/org/elasticsearch/index/query/QueryBuilders.java
+++ b/src/main/java/org/elasticsearch/index/query/QueryBuilders.java
@@ -65,6 +65,16 @@ public abstract class QueryBuilders {
     public static MatchQueryBuilder matchQuery(String name, Object text) {
         return new MatchQueryBuilder(name, text).type(MatchQueryBuilder.Type.BOOLEAN);
     }
+
+    /**
+     * Creates a common query for the provided field name and text.
+     *
+     * @param name The field name.
+     * @param text The query text (to be analyzed).
+     */
+    public static CommonTermsQueryBuilder commonTerms(String name, Object text) {
+        return new CommonTermsQueryBuilder(name, text);
+    }
 
     /**
      * Creates a match query with type "BOOLEAN" for the provided field name and text.
diff --git a/src/main/java/org/elasticsearch/indices/query/IndicesQueriesModule.java b/src/main/java/org/elasticsearch/indices/query/IndicesQueriesModule.java
index 23d9dc09818..c5d1a6bae98 100644
--- a/src/main/java/org/elasticsearch/indices/query/IndicesQueriesModule.java
+++ b/src/main/java/org/elasticsearch/indices/query/IndicesQueriesModule.java
@@ -101,6 +101,8 @@ public class IndicesQueriesModule extends AbstractModule {
         qpBinders.addBinding().to(FuzzyLikeThisFieldQueryParser.class).asEagerSingleton();
         qpBinders.addBinding().to(WrapperQueryParser.class).asEagerSingleton();
         qpBinders.addBinding().to(IndicesQueryParser.class).asEagerSingleton();
+        qpBinders.addBinding().to(CommonTermsQueryParser.class).asEagerSingleton();
+
         if (ShapesAvailability.JTS_AVAILABLE) {
             qpBinders.addBinding().to(GeoShapeQueryParser.class).asEagerSingleton();
         }
diff --git a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
index f6cb0e8d5b6..f0fe9fde574 100644
--- a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
@@ -24,6 +24,7 @@ import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.index.query.*;
+import org.elasticsearch.index.query.CommonTermsQueryBuilder.Operator;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.facet.FacetBuilders;
 import org.elasticsearch.test.integration.AbstractNodesTests;
@@ -104,7 +105,43 @@ public class SimpleQueryTests extends AbstractNodesTests {
             assertTrue(e.getMessage().endsWith("IllegalStateException[field \"field1\" was indexed without position data; cannot run PhraseQuery (term=quick)]; }"));
         }
     }
+
+    @Test
+    public void testCommonTermsQuery() throws Exception {
+        try {
+            client.admin().indices().prepareDelete("test").execute().actionGet();
+        } catch (Exception e) {
+            // ignore
+        }
+        client.admin().indices().prepareCreate("test")
+                .addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field1").field("analyzer", "whitespace").field("type", "string").endObject().endObject().endObject().endObject())
+                .setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 1)).execute().actionGet();
+
+        client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet();
+        client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet();
+        client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet();
+
+        SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(2L));
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+
+
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).lowFreqOperator(Operator.OR)).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(3L));
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
+
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(3L));
+        // standard drops "the" since it's a stopword
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
+        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
+    }
+
 
     @Test
     public void testOmitTermFreqsAndPositions() throws Exception {
         // backwards compat test!