Add support for "high_freq" and "low_freq" parameters for the Common Terms Query

"minimum_should_match" parameter. The "high_freq" value is used when the
query contains only high-frequency terms.

Closes #3188
This commit is contained in:
Cédric HOURCADE 2013-06-17 10:57:03 +01:00
parent 8363fcf281
commit d41c37fdfa
9 changed files with 235 additions and 23 deletions

View File

@ -17,7 +17,14 @@ package org.apache.lucene.queries;
* specific language governing permissions and limitations * specific language governing permissions and limitations
* under the License. * under the License.
*/ */
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.elasticsearch.common.lucene.search.Queries; import org.elasticsearch.common.lucene.search.Queries;
/** /**
@ -35,18 +42,97 @@ public class ExtendedCommonTermsQuery extends CommonTermsQuery {
super(highFreqOccur, lowFreqOccur, maxTermFrequency); super(highFreqOccur, lowFreqOccur, maxTermFrequency);
} }
private String minNumShouldMatchSpec; private String lowFreqMinNumShouldMatchSpec;
private String highFreqMinNumShouldMatchSpec;
@Override @Override
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) { protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
if (minNumShouldMatchSpec == null) { return calcMinimumNumberShouldMatch(lowFreqMinNumShouldMatchSpec, numOptional);
}
protected int calcMinimumNumberShouldMatch(String spec, int numOptional) {
if (spec == null) {
return 0; return 0;
} }
return Queries.calculateMinShouldMatch(numOptional, minNumShouldMatchSpec); return Queries.calculateMinShouldMatch(numOptional, spec);
} }
public void setMinimumNumberShouldMatch(String spec) { protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) {
this.minNumShouldMatchSpec = spec; return calcMinimumNumberShouldMatch(highFreqMinNumShouldMatchSpec, numOptional);
} }
/**
 * Stores the minimum-should-match specification applied to the high
 * frequency clauses.
 *
 * @param spec the specification string, or {@code null} to leave it unset;
 *             when unset, {@code calcHighFreqMinimumNumberShouldMatch}
 *             yields 0
 */
public void setHighFreqMinimumNumberShouldMatch(String spec) {
    highFreqMinNumShouldMatchSpec = spec;
}
/**
 * Returns the minimum-should-match specification for the high frequency
 * clauses.
 *
 * @return the stored specification string, or {@code null} if none was set
 */
public String getHighFreqMinimumNumberShouldMatch() {
    return this.highFreqMinNumShouldMatchSpec;
}
/**
 * Stores the minimum-should-match specification applied to the low
 * frequency clauses.
 *
 * @param spec the specification string, or {@code null} to leave it unset;
 *             when unset, {@code calcLowFreqMinimumNumberShouldMatch}
 *             yields 0
 */
public void setLowFreqMinimumNumberShouldMatch(String spec) {
    lowFreqMinNumShouldMatchSpec = spec;
}
/**
 * Returns the minimum-should-match specification for the low frequency
 * clauses.
 *
 * @return the stored specification string, or {@code null} if none was set
 */
public String getLowFreqMinimumNumberShouldMatch() {
    return this.lowFreqMinNumShouldMatchSpec;
}
/**
 * Splits the query terms into a low frequency and a high frequency boolean
 * query and combines them.
 *
 * A term goes to the high frequency group when its document frequency
 * exceeds {@code maxTermFrequency}; otherwise, or when it has no
 * {@link TermContext}, it goes to the low frequency group. When a group's
 * occur is {@code SHOULD} and the group is non-empty, its
 * minimum-should-match value is computed from the group's clause count.
 *
 * @param maxDoc       document count used to scale {@code maxTermFrequency}
 * @param contextArray per-term contexts, indexed like {@code queryTerms};
 *                     entries may be {@code null}
 * @param queryTerms   the terms to build the query from
 * @return the high frequency query when there are no low frequency terms,
 *         the low frequency query when there are no high frequency terms,
 *         otherwise a boolean query requiring the low frequency part and
 *         optionally matching the high frequency part
 */
@Override
protected Query buildQuery(final int maxDoc, final TermContext[] contextArray, final Term[] queryTerms) {
    final BooleanQuery lowFreq = new BooleanQuery(disableCoord);
    final BooleanQuery highFreq = new BooleanQuery(disableCoord);
    highFreq.setBoost(highFreqBoost);
    lowFreq.setBoost(lowFreqBoost);

    for (int i = 0; i < queryTerms.length; i++) {
        final TermContext context = contextArray[i];
        final Term term = queryTerms[i];
        if (context == null) {
            // No context for this term: it is handled as a low frequency term.
            lowFreq.add(new TermQuery(term), lowFreqOccur);
            continue;
        }
        final int docFreq = context.docFreq();
        final boolean isHighFreq = (maxTermFrequency >= 1f && docFreq > maxTermFrequency)
                || (docFreq > (int) Math.ceil(maxTermFrequency * (float) maxDoc));
        if (isHighFreq) {
            highFreq.add(new TermQuery(term, context), highFreqOccur);
        } else {
            lowFreq.add(new TermQuery(term, context), lowFreqOccur);
        }
    }

    final int numLowFreqClauses = lowFreq.clauses().size();
    final int numHighFreqClauses = highFreq.clauses().size();
    if (numLowFreqClauses > 0 && lowFreqOccur == Occur.SHOULD) {
        lowFreq.setMinimumNumberShouldMatch(calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses));
    }
    if (numHighFreqClauses > 0 && highFreqOccur == Occur.SHOULD) {
        highFreq.setMinimumNumberShouldMatch(calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses));
    }

    if (numLowFreqClauses == 0) {
        /*
         * Without low frequency terms, the high frequency terms are rewritten
         * into a conjunction to prevent slow queries, but only if no specific
         * high_freq should_match was specified.
         */
        if (highFreqMinNumShouldMatchSpec == null && highFreqOccur != Occur.MUST) {
            for (BooleanClause clause : highFreq) {
                clause.setOccur(Occur.MUST);
            }
        }
        highFreq.setBoost(getBoost());
        return highFreq;
    }

    if (numHighFreqClauses == 0) {
        // Only low frequency terms are present.
        lowFreq.setBoost(getBoost());
        return lowFreq;
    }

    // Both groups are present: low frequency part is required, high frequency part is optional.
    final BooleanQuery combined = new BooleanQuery(true);
    combined.add(highFreq, Occur.SHOULD);
    combined.add(lowFreq, Occur.MUST);
    combined.setBoost(getBoost());
    return combined;
}
} }

View File

@ -58,7 +58,9 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
private Float boost = null; private Float boost = null;
private String minimumShouldMatch = null; private String lowFreqMinimumShouldMatch = null;
private String highFreqMinimumShouldMatch = null;
private Boolean disableCoords = null; private Boolean disableCoords = null;
@ -127,11 +129,20 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
} }
/** /**
* Sets the minimum number of query terms that need to match in order to * Sets the minimum number of high frequent query terms that need to match in order to
* produce a hit when there are no low-frequency terms.
*/
public CommonTermsQueryBuilder highFreqMinimumShouldMatch(String highFreqMinimumShouldMatch) {
// A non-null value is emitted as the "high_freq" entry of the "minimum_should_match" object.
this.highFreqMinimumShouldMatch = highFreqMinimumShouldMatch;
// Return this builder so that calls can be chained.
return this;
}
/**
* Sets the minimum number of low frequent query terms that need to match in order to
* produce a hit. * produce a hit.
*/ */
public CommonTermsQueryBuilder minimumShouldMatch(String minimumShouldMatch) { public CommonTermsQueryBuilder lowFreqMinimumShouldMatch(String lowFreqMinimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch; this.lowFreqMinimumShouldMatch = lowFreqMinimumShouldMatch;
return this; return this;
} }
@ -159,8 +170,15 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
if (cutoffFrequency != null) { if (cutoffFrequency != null) {
builder.field("cutoff_frequency", cutoffFrequency); builder.field("cutoff_frequency", cutoffFrequency);
} }
if (minimumShouldMatch != null) { if (lowFreqMinimumShouldMatch != null || highFreqMinimumShouldMatch != null) {
builder.field("minimum_should_match", minimumShouldMatch); builder.startObject("minimum_should_match");
if (lowFreqMinimumShouldMatch != null) {
builder.field("low_freq", lowFreqMinimumShouldMatch);
}
if (highFreqMinimumShouldMatch != null) {
builder.field("high_freq", highFreqMinimumShouldMatch);
}
builder.endObject();
} }
builder.endObject(); builder.endObject();

View File

@ -78,7 +78,8 @@ public class CommonTermsQueryParser implements QueryParser {
Object value = null; Object value = null;
float boost = 1.0f; float boost = 1.0f;
String queryAnalyzer = null; String queryAnalyzer = null;
String minimumShouldMatch = null; String lowFreqMinimumShouldMatch = null;
String highFreqMinimumShouldMatch = null;
boolean disableCoords = DEFAULT_DISABLE_COORDS; boolean disableCoords = DEFAULT_DISABLE_COORDS;
Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR; Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR; Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR;
@ -89,6 +90,23 @@ public class CommonTermsQueryParser implements QueryParser {
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) { if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName(); currentFieldName = parser.currentName();
} else if (token == XContentParser.Token.START_OBJECT) {
if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
    // Object form: { "low_freq" : <spec>, "high_freq" : <spec> }.
    String innerFieldName = null;
    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
            innerFieldName = parser.currentName();
        } else if (token.isValue()) {
            if ("low_freq".equals(innerFieldName) || "lowFreq".equals(innerFieldName)) {
                lowFreqMinimumShouldMatch = parser.text();
            } else if ("high_freq".equals(innerFieldName) || "highFreq".equals(innerFieldName)) {
                highFreqMinimumShouldMatch = parser.text();
            } else {
                throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + innerFieldName + "] for [" + currentFieldName + "]");
            }
        }
    }
} else {
    // Reject any other object-valued field explicitly. If it were skipped,
    // the enclosing loop would read the object's inner tokens as top-level
    // fields, and its END_OBJECT would end field parsing early.
    throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + currentFieldName + "]");
}
} else if (token.isValue()) { } else if (token.isValue()) {
if ("query".equals(currentFieldName)) { if ("query".equals(currentFieldName)) {
value = parser.objectText(); value = parser.objectText();
@ -123,7 +141,7 @@ public class CommonTermsQueryParser implements QueryParser {
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]"); "[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
} }
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) { } else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
minimumShouldMatch = parser.textOrNull(); lowFreqMinimumShouldMatch = parser.text();
} else if ("cutoff_frequency".equals(currentFieldName)) { } else if ("cutoff_frequency".equals(currentFieldName)) {
maxTermFrequency = parser.floatValue(); maxTermFrequency = parser.floatValue();
} else { } else {
@ -148,12 +166,12 @@ public class CommonTermsQueryParser implements QueryParser {
} }
ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords); ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
query.setBoost(boost); query.setBoost(boost);
return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, minimumShouldMatch); return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, lowFreqMinimumShouldMatch, highFreqMinimumShouldMatch);
} }
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext, private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
String queryAnalyzer, String minimumShouldMatch) throws IOException { String queryAnalyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
FieldMapper<?> mapper = null; FieldMapper<?> mapper = null;
String field; String field;
@ -199,7 +217,8 @@ public class CommonTermsQueryParser implements QueryParser {
if (count == 0) { if (count == 0) {
return null; return null;
} }
query.setMinimumNumberShouldMatch(minimumShouldMatch); query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext); return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
} }
} }

View File

@ -166,7 +166,7 @@ public class MatchQueryParser implements QueryParser {
if (query instanceof BooleanQuery) { if (query instanceof BooleanQuery) {
Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch); Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch);
} else if (query instanceof ExtendedCommonTermsQuery) { } else if (query instanceof ExtendedCommonTermsQuery) {
((ExtendedCommonTermsQuery)query).setMinimumNumberShouldMatch(minimumShouldMatch); ((ExtendedCommonTermsQuery)query).setLowFreqMinimumNumberShouldMatch(minimumShouldMatch);
} }
query.setBoost(boost); query.setBoost(boost);
return query; return query;

View File

@ -31,6 +31,7 @@ import org.elasticsearch.index.query.MatchQueryBuilder.Type;
import org.elasticsearch.rest.RestStatus; import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.facet.FacetBuilders; import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.test.integration.AbstractSharedClusterTest; import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.IOException; import java.io.IOException;
@ -145,6 +146,29 @@ public class SimpleQueryTests extends AbstractSharedClusterTest {
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2")); assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3")); assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the huge fox").lowFreqMinimumShouldMatch("2")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("3")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(2l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("4")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery("{ \"common\" : { \"field1\" : { \"query\" : \"the lazy fox brown\", \"cutoff_frequency\" : 1, \"minimum_should_match\" : { \"high_freq\" : 4 } } } }").execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
// Default
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1)).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet(); searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(3l)); assertThat(searchResponse.getHits().totalHits(), equalTo(3l));
// standard drops "the" since its a stopword // standard drops "the" since its a stopword

View File

@ -21,6 +21,7 @@ package org.elasticsearch.test.unit.index.query;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.queries.BoostingQuery; import org.apache.lucene.queries.BoostingQuery;
import org.apache.lucene.queries.FilterClause; import org.apache.lucene.queries.FilterClause;
import org.apache.lucene.queries.TermsFilter; import org.apache.lucene.queries.TermsFilter;
@ -2191,4 +2192,37 @@ public class SimpleIndexQueryParserTests {
ConstantScoreQuery csq = (ConstantScoreQuery) parsedQuery; ConstantScoreQuery csq = (ConstantScoreQuery) parsedQuery;
assertThat(csq.getFilter(), instanceOf(IntersectsPrefixTreeFilter.class)); assertThat(csq.getFilter(), instanceOf(IntersectsPrefixTreeFilter.class));
} }
/**
 * Parses commonTerms-query1.json and expects an ExtendedCommonTermsQuery
 * whose low frequency spec is "2" and whose high frequency spec is unset.
 */
@Test
public void testCommonTermsQuery1() throws IOException {
    final IndexQueryParserService service = queryParser();
    final Query parsed = service.parse(
            copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query1.json")).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonTerms = (ExtendedCommonTermsQuery) parsed;
    assertThat(commonTerms.getHighFreqMinimumNumberShouldMatch(), nullValue());
    assertThat(commonTerms.getLowFreqMinimumNumberShouldMatch(), equalTo("2"));
}
/**
 * Parses commonTerms-query2.json and expects an ExtendedCommonTermsQuery
 * carrying both specs: "50%" for high frequency and "5<20%" for low frequency.
 */
@Test
public void testCommonTermsQuery2() throws IOException {
    final IndexQueryParserService service = queryParser();
    final Query parsed = service.parse(
            copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query2.json")).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonTerms = (ExtendedCommonTermsQuery) parsed;
    assertThat(commonTerms.getHighFreqMinimumNumberShouldMatch(), equalTo("50%"));
    assertThat(commonTerms.getLowFreqMinimumNumberShouldMatch(), equalTo("5<20%"));
}
/**
 * Parses commonTerms-query3.json and expects an ExtendedCommonTermsQuery
 * whose low frequency spec is "2" and whose high frequency spec is unset.
 * NOTE(review): presumably this fixture uses the plain scalar form of
 * "minimum_should_match" - confirm against the JSON file.
 */
@Test
public void testCommonTermsQuery3() throws IOException {
    final IndexQueryParserService service = queryParser();
    final Query parsed = service.parse(
            copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query3.json")).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonTerms = (ExtendedCommonTermsQuery) parsed;
    assertThat(commonTerms.getHighFreqMinimumNumberShouldMatch(), nullValue());
    assertThat(commonTerms.getLowFreqMinimumNumberShouldMatch(), equalTo("2"));
}
} }

View File

@ -0,0 +1,11 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"cutoff_frequency" : 1,
"minimum_should_match" : {
"low_freq" : 2
}
}
}
}

View File

@ -0,0 +1,11 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"minimum_should_match" : {
"high_freq" : "50%",
"low_freq" : "5<20%"
}
}
}
}

View File

@ -0,0 +1,9 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"cutoff_frequency" : 1,
"minimum_should_match" : 2
}
}
}