Add support for "high_freq" and "low_freq" parameters for Common Query

"minimum_should_match" parameter. The "high_freq" parameter is used when the
query has only high-frequency terms.

Closes #3188
This commit is contained in:
Cédric HOURCADE 2013-06-17 10:57:03 +01:00
parent 8363fcf281
commit d41c37fdfa
9 changed files with 235 additions and 23 deletions

View File

@ -17,7 +17,14 @@ package org.apache.lucene.queries;
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.elasticsearch.common.lucene.search.Queries;
/**
@ -35,18 +42,97 @@ public class ExtendedCommonTermsQuery extends CommonTermsQuery {
super(highFreqOccur, lowFreqOccur, maxTermFrequency);
}
private String minNumShouldMatchSpec;
private String lowFreqMinNumShouldMatchSpec;
private String highFreqMinNumShouldMatchSpec;
@Override
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
if (minNumShouldMatchSpec == null) {
return 0;
}
return Queries.calculateMinShouldMatch(numOptional, minNumShouldMatchSpec);
}
public void setMinimumNumberShouldMatch(String spec) {
this.minNumShouldMatchSpec = spec;
return calcMinimumNumberShouldMatch(lowFreqMinNumShouldMatchSpec, numOptional);
}
/**
 * Resolves a minimum-should-match specification against a number of optional clauses.
 *
 * @param spec        the specification string, or {@code null} when none has been set
 * @param numOptional the number of optional clauses the specification is applied to
 * @return {@code 0} when {@code spec} is {@code null}; otherwise the result of
 *         {@code Queries.calculateMinShouldMatch(numOptional, spec)}
 */
protected int calcMinimumNumberShouldMatch(String spec, int numOptional) {
    return spec == null ? 0 : Queries.calculateMinShouldMatch(numOptional, spec);
}
/**
 * Computes the minimum number of optional high-frequency clauses that must match,
 * by applying the stored high-frequency specification to {@code numOptional}.
 *
 * @param numOptional the number of optional high-frequency clauses
 * @return the value returned by {@code calcMinimumNumberShouldMatch} for the
 *         high-frequency specification
 */
protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) {
return calcMinimumNumberShouldMatch(highFreqMinNumShouldMatchSpec, numOptional);
}
/**
 * Stores the minimum-should-match specification used for high-frequency terms.
 *
 * @param spec the specification string; the value is stored as given, without validation here
 */
public void setHighFreqMinimumNumberShouldMatch(String spec) {
this.highFreqMinNumShouldMatchSpec = spec;
}
/**
 * Returns the stored minimum-should-match specification for high-frequency terms.
 *
 * @return the specification string exactly as it was set
 */
public String getHighFreqMinimumNumberShouldMatch() {
return highFreqMinNumShouldMatchSpec;
}
/**
 * Stores the minimum-should-match specification used for low-frequency terms.
 *
 * @param spec the specification string; the value is stored as given, without validation here
 */
public void setLowFreqMinimumNumberShouldMatch(String spec) {
this.lowFreqMinNumShouldMatchSpec = spec;
}
/**
 * Returns the stored minimum-should-match specification for low-frequency terms.
 *
 * @return the specification string exactly as it was set
 */
public String getLowFreqMinimumNumberShouldMatch() {
return lowFreqMinNumShouldMatchSpec;
}
/**
 * Splits the query terms into a low-frequency and a high-frequency boolean query and
 * combines them into the query that is finally executed.
 *
 * <p>A term without a {@link TermContext} is added to the low-frequency group. A term whose
 * document frequency exceeds the cutoff derived from {@code maxTermFrequency} is added to
 * the high-frequency group; every other term is added to the low-frequency group.
 *
 * @param maxDoc       document count used to turn a fractional cutoff into a document frequency
 * @param contextArray per-term contexts, index-aligned with {@code queryTerms}; entries may be {@code null}
 * @param queryTerms   the terms to distribute over the two groups
 * @return the high-frequency query when there are no low-frequency clauses, the low-frequency
 *         query when there are no high-frequency clauses, otherwise a boolean query holding the
 *         high-frequency group as SHOULD and the low-frequency group as MUST
 */
@Override
protected Query buildQuery(final int maxDoc, final TermContext[] contextArray, final Term[] queryTerms) {
    final BooleanQuery rareTerms = new BooleanQuery(disableCoord);
    final BooleanQuery frequentTerms = new BooleanQuery(disableCoord);
    frequentTerms.setBoost(highFreqBoost);
    rareTerms.setBoost(lowFreqBoost);

    for (int idx = 0; idx < queryTerms.length; idx++) {
        final Term term = queryTerms[idx];
        final TermContext context = contextArray[idx];
        if (context == null) {
            // No context for this term: it goes to the low-frequency group.
            rareTerms.add(new TermQuery(term), lowFreqOccur);
            continue;
        }
        final boolean aboveCutoff =
                (maxTermFrequency >= 1f && context.docFreq() > maxTermFrequency)
                || (context.docFreq() > (int) Math.ceil(maxTermFrequency * (float) maxDoc));
        if (aboveCutoff) {
            frequentTerms.add(new TermQuery(term, context), highFreqOccur);
        } else {
            rareTerms.add(new TermQuery(term, context), lowFreqOccur);
        }
    }

    final int rareCount = rareTerms.clauses().size();
    final int frequentCount = frequentTerms.clauses().size();

    // A minimum-should-match value is only applied to a non-empty group of SHOULD clauses.
    if (rareCount > 0 && lowFreqOccur == Occur.SHOULD) {
        rareTerms.setMinimumNumberShouldMatch(calcLowFreqMinimumNumberShouldMatch(rareCount));
    }
    if (frequentCount > 0 && highFreqOccur == Occur.SHOULD) {
        frequentTerms.setMinimumNumberShouldMatch(calcHighFreqMinimumNumberShouldMatch(frequentCount));
    }

    if (rareCount == 0) {
        /*
         * Without low-frequency terms the high-frequency terms are rewritten into a
         * conjunction to prevent slow queries - but only if no specific high_freq
         * should_match has been specified.
         */
        if (highFreqMinNumShouldMatchSpec == null && highFreqOccur != Occur.MUST) {
            for (BooleanClause clause : frequentTerms) {
                clause.setOccur(Occur.MUST);
            }
        }
        frequentTerms.setBoost(getBoost());
        return frequentTerms;
    }

    if (frequentCount == 0) {
        // There are no high-frequency terms, so only the low-frequency terms are queried.
        rareTerms.setBoost(getBoost());
        return rareTerms;
    }

    final BooleanQuery combined = new BooleanQuery(true);
    combined.add(frequentTerms, Occur.SHOULD);
    combined.add(rareTerms, Occur.MUST);
    combined.setBoost(getBoost());
    return combined;
}
}

View File

@ -58,7 +58,9 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
private Float boost = null;
private String minimumShouldMatch = null;
private String lowFreqMinimumShouldMatch = null;
private String highFreqMinimumShouldMatch = null;
private Boolean disableCoords = null;
@ -127,11 +129,20 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
}
/**
* Sets the minimum number of query terms that need to match in order to
* Sets the minimum number of high frequent query terms that need to match in order to
* produce a hit when there are no low frequent terms.
*/
public CommonTermsQueryBuilder highFreqMinimumShouldMatch(String highFreqMinimumShouldMatch) {
// Store the spec as given and return this builder so calls can be chained.
this.highFreqMinimumShouldMatch = highFreqMinimumShouldMatch;
return this;
}
/**
* Sets the minimum number of low frequent query terms that need to match in order to
* produce a hit.
*/
public CommonTermsQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch;
public CommonTermsQueryBuilder lowFreqMinimumShouldMatch(String lowFreqMinimumShouldMatch) {
this.lowFreqMinimumShouldMatch = lowFreqMinimumShouldMatch;
return this;
}
@ -159,11 +170,18 @@ public class CommonTermsQueryBuilder extends BaseQueryBuilder implements Boostab
if (cutoffFrequency != null) {
builder.field("cutoff_frequency", cutoffFrequency);
}
if (minimumShouldMatch != null) {
builder.field("minimum_should_match", minimumShouldMatch);
if (lowFreqMinimumShouldMatch != null || highFreqMinimumShouldMatch != null) {
builder.startObject("minimum_should_match");
if (lowFreqMinimumShouldMatch != null) {
builder.field("low_freq", lowFreqMinimumShouldMatch);
}
if (highFreqMinimumShouldMatch != null) {
builder.field("high_freq", highFreqMinimumShouldMatch);
}
builder.endObject();
}
builder.endObject();
builder.endObject();
}
}
}

View File

@ -78,7 +78,8 @@ public class CommonTermsQueryParser implements QueryParser {
Object value = null;
float boost = 1.0f;
String queryAnalyzer = null;
String minimumShouldMatch = null;
String lowFreqMinimumShouldMatch = null;
String highFreqMinimumShouldMatch = null;
boolean disableCoords = DEFAULT_DISABLE_COORDS;
Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR;
@ -89,6 +90,23 @@ public class CommonTermsQueryParser implements QueryParser {
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (token == XContentParser.Token.START_OBJECT) {
if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
String innerFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
innerFieldName = parser.currentName();
} else if (token.isValue()) {
if ("low_freq".equals(innerFieldName) || "lowFreq".equals(innerFieldName)) {
lowFreqMinimumShouldMatch = parser.text();
} else if ("high_freq".equals(innerFieldName) || "highFreq".equals(innerFieldName)) {
highFreqMinimumShouldMatch = parser.text();
} else {
throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + innerFieldName + "] for [" + currentFieldName + "]");
}
}
}
}
} else if (token.isValue()) {
if ("query".equals(currentFieldName)) {
value = parser.objectText();
@ -123,7 +141,7 @@ public class CommonTermsQueryParser implements QueryParser {
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
}
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
minimumShouldMatch = parser.textOrNull();
lowFreqMinimumShouldMatch = parser.text();
} else if ("cutoff_frequency".equals(currentFieldName)) {
maxTermFrequency = parser.floatValue();
} else {
@ -148,12 +166,12 @@ public class CommonTermsQueryParser implements QueryParser {
}
ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
query.setBoost(boost);
return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, minimumShouldMatch);
return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, lowFreqMinimumShouldMatch, highFreqMinimumShouldMatch);
}
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
String queryAnalyzer, String minimumShouldMatch) throws IOException {
String queryAnalyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
FieldMapper<?> mapper = null;
String field;
@ -199,7 +217,8 @@ public class CommonTermsQueryParser implements QueryParser {
if (count == 0) {
return null;
}
query.setMinimumNumberShouldMatch(minimumShouldMatch);
query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}
}

View File

@ -166,9 +166,9 @@ public class MatchQueryParser implements QueryParser {
if (query instanceof BooleanQuery) {
Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch);
} else if (query instanceof ExtendedCommonTermsQuery) {
((ExtendedCommonTermsQuery)query).setMinimumNumberShouldMatch(minimumShouldMatch);
((ExtendedCommonTermsQuery)query).setLowFreqMinimumNumberShouldMatch(minimumShouldMatch);
}
query.setBoost(boost);
return query;
}
}
}

View File

@ -31,6 +31,7 @@ import org.elasticsearch.index.query.MatchQueryBuilder.Type;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.testng.annotations.Test;
import java.io.IOException;
@ -145,6 +146,29 @@ public class SimpleQueryTests extends AbstractSharedClusterTest {
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the huge fox").lowFreqMinimumShouldMatch("2")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("3")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(2l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("4")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery("{ \"common\" : { \"field1\" : { \"query\" : \"the lazy fox brown\", \"cutoff_frequency\" : 1, \"minimum_should_match\" : { \"high_freq\" : 4 } } } }").execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
// Default
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the lazy fox brown").cutoffFrequency(1)).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(3l));
// standard drops "the" since it's a stopword

View File

@ -21,6 +21,7 @@ package org.elasticsearch.test.unit.index.query;
import com.google.common.collect.Lists;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.queries.BoostingQuery;
import org.apache.lucene.queries.FilterClause;
import org.apache.lucene.queries.TermsFilter;
@ -2191,4 +2192,37 @@ public class SimpleIndexQueryParserTests {
ConstantScoreQuery csq = (ConstantScoreQuery) parsedQuery;
assertThat(csq.getFilter(), instanceOf(IntersectsPrefixTreeFilter.class));
}
/**
 * Parses the commonTerms-query1.json fixture and checks which minimum-should-match
 * specifications end up on the resulting {@link ExtendedCommonTermsQuery}.
 */
@Test
public void testCommonTermsQuery1() throws IOException {
    final IndexQueryParserService parserService = queryParser();
    final String json = copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query1.json");
    final Query parsed = parserService.parse(json).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonQuery = (ExtendedCommonTermsQuery) parsed;
    // Expected: no high-frequency spec, low-frequency spec of "2".
    assertThat(commonQuery.getHighFreqMinimumNumberShouldMatch(), nullValue());
    assertThat(commonQuery.getLowFreqMinimumNumberShouldMatch(), equalTo("2"));
}
/**
 * Parses the commonTerms-query2.json fixture and checks that both the high-frequency
 * and the low-frequency specifications are set on the resulting
 * {@link ExtendedCommonTermsQuery}.
 */
@Test
public void testCommonTermsQuery2() throws IOException {
    final IndexQueryParserService parserService = queryParser();
    final String json = copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query2.json");
    final Query parsed = parserService.parse(json).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonQuery = (ExtendedCommonTermsQuery) parsed;
    // Expected: high-frequency spec "50%", low-frequency spec "5<20%".
    assertThat(commonQuery.getHighFreqMinimumNumberShouldMatch(), equalTo("50%"));
    assertThat(commonQuery.getLowFreqMinimumNumberShouldMatch(), equalTo("5<20%"));
}
/**
 * Parses the commonTerms-query3.json fixture and checks which minimum-should-match
 * specifications end up on the resulting {@link ExtendedCommonTermsQuery}.
 */
@Test
public void testCommonTermsQuery3() throws IOException {
    final IndexQueryParserService parserService = queryParser();
    final String json = copyToStringFromClasspath("/org/elasticsearch/test/unit/index/query/commonTerms-query3.json");
    final Query parsed = parserService.parse(json).query();
    assertThat(parsed, instanceOf(ExtendedCommonTermsQuery.class));

    final ExtendedCommonTermsQuery commonQuery = (ExtendedCommonTermsQuery) parsed;
    // Expected: no high-frequency spec, low-frequency spec of "2".
    assertThat(commonQuery.getHighFreqMinimumNumberShouldMatch(), nullValue());
    assertThat(commonQuery.getLowFreqMinimumNumberShouldMatch(), equalTo("2"));
}
}

View File

@ -0,0 +1,11 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"cutoff_frequency" : 1,
"minimum_should_match" : {
"low_freq" : 2
}
}
}
}

View File

@ -0,0 +1,11 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"minimum_should_match" : {
"high_freq" : "50%",
"low_freq" : "5<20%"
}
}
}
}

View File

@ -0,0 +1,9 @@
{
"common" : {
"dogs" : {
"query" : "buck mia tom",
"cutoff_frequency" : 1,
"minimum_should_match" : 2
}
}
}