MLT Query: use minimum should match more extensive syntax

The minimum number of optional should clauses of the generated query that must
match can now be set using the more extensive minimum should match syntax. This
deprecates the `percent_terms_to_match` parameter in favor of a new
`minimum_should_match` parameter.

Closes #7898
This commit is contained in:
Alex Ksikes 2014-09-26 16:30:43 +02:00
parent 03d880de38
commit 5014158d6b
13 changed files with 170 additions and 45 deletions

View File

@ -29,8 +29,9 @@ The `more_like_this_field` top level parameters include:
|Parameter |Description
|`like_text` |The text to find documents like it, *required*.
|`percent_terms_to_match` |The percentage of terms to match on (float
value). Defaults to `0.3` (30 percent).
|`minimum_should_match`| From the generated query, the number of terms that
must match, following the <<query-dsl-minimum-should-match,minimum should
match syntax>>. (Defaults to `"30%"`).
|`min_term_freq` |The frequency below which terms will be ignored in the
source doc. The default frequency is `2`.

View File

@ -87,8 +87,9 @@ unless specified otherwise in each `doc`.
|`include` |When using `ids` or `docs`, specifies whether the documents should be
included from the search. Defaults to `false`.
|`percent_terms_to_match` |From the generated query, the percentage of terms
that must match (float value between 0 and 1). Defaults to `0.3` (30 percent).
|`minimum_should_match`| From the generated query, the number of terms that
must match, following the <<query-dsl-minimum-should-match,minimum should
match syntax>>. (Defaults to `"30%"`).
|`min_term_freq` |The frequency below which terms will be ignored in the
source doc. The default frequency is `2`.

View File

@ -33,6 +33,7 @@ import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.search.Scroll;
@ -66,7 +67,7 @@ public class MoreLikeThisRequest extends ActionRequest<MoreLikeThisRequest> impl
private String[] fields;
private float percentTermsToMatch = -1;
private String minimumShouldMatch = "0%";
private int minTermFreq = -1;
private int maxQueryTerms = -1;
private String[] stopWords = null;
@ -211,18 +212,44 @@ public class MoreLikeThisRequest extends ActionRequest<MoreLikeThisRequest> impl
}
/**
* The percent of the terms to match for each field. Defaults to <tt>0.3f</tt>.
* Number of terms that must match the generated query expressed in the
* common syntax for minimum should match. Defaults to <tt>30%</tt>.
*
* @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
*/
public MoreLikeThisRequest percentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
public MoreLikeThisRequest minimumShouldMatch(String minimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch;
return this;
}
/**
 * Returns the minimum should match specification currently held by this request,
 * i.e. the number of terms that must match the generated query written in the
 * common minimum should match syntax.
 *
 * @return the stored specification
 * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
 */
public String minimumShouldMatch() {
    return minimumShouldMatch;
}
/**
 * Sets the fraction of terms to match by converting it into the percentage form
 * accepted by {@link #minimumShouldMatch(String)}; for example <tt>0.3f</tt>
 * becomes <tt>"30%"</tt>.
 *
 * @param percentTermsToMatch fraction of terms to match, e.g. <tt>0.3f</tt> for 30 percent
 * @return this request, for chaining
 * @deprecated use {@link #minimumShouldMatch(String)} instead
 */
@Deprecated
public MoreLikeThisRequest percentTermsToMatch(float percentTermsToMatch) {
    // A bare (int) cast truncates binary float error: 0.53f * 100 evaluates to
    // 52.999996f and would be sent as "52%". Rounding to 1/100 of a percent first
    // absorbs that error; the integer division then truncates to a whole percent
    // exactly as before for every other input.
    return minimumShouldMatch(Math.round(percentTermsToMatch * 10000) / 100 + "%");
}
/**
 * Returns the fraction of terms to match derived from the stored minimum should
 * match value. A plain percentage such as <tt>"30%"</tt> yields <tt>0.3f</tt>;
 * any value that is not a plain percentage yields <tt>-1</tt>.
 *
 * @return the fraction of terms to match, or <tt>-1</tt> when the stored value
 *         cannot be expressed as one
 * @deprecated use {@link #minimumShouldMatch()} instead
 */
@Deprecated
public float percentTermsToMatch() {
    final String spec = minimumShouldMatch;
    if (spec == null || !spec.endsWith("%")) {
        return -1;
    }
    try {
        return Float.parseFloat(spec.substring(0, spec.length() - 1)) / 100;
    } catch (NumberFormatException e) {
        // Specifications such as "3<90%" end in '%' but are not a single number;
        // report them as "not expressible" instead of failing the caller.
        return -1;
    }
}
/**
@ -584,7 +611,12 @@ public class MoreLikeThisRequest extends ActionRequest<MoreLikeThisRequest> impl
}
}
percentTermsToMatch = in.readFloat();
if (in.getVersion().onOrAfter(Version.V_1_5_0)) {
minimumShouldMatch(in.readString());
} else {
percentTermsToMatch(in.readFloat());
}
minTermFreq = in.readVInt();
maxQueryTerms = in.readVInt();
size = in.readVInt();
@ -661,7 +693,12 @@ public class MoreLikeThisRequest extends ActionRequest<MoreLikeThisRequest> impl
}
}
out.writeFloat(percentTermsToMatch);
if (out.getVersion().onOrAfter(Version.V_1_5_0)) {
out.writeString(minimumShouldMatch);
} else {
out.writeFloat(percentTermsToMatch());
}
out.writeVInt(minTermFreq);
out.writeVInt(maxQueryTerms);
if (stopWords == null) {

View File

@ -60,12 +60,22 @@ public class MoreLikeThisRequestBuilder extends ActionRequestBuilder<MoreLikeThi
return this;
}
/**
 * Forwards the minimum should match specification (the number of terms that must
 * match the generated query, in the common minimum should match syntax) to the
 * underlying request.
 *
 * @param minimumShouldMatch the minimum should match specification
 * @return this builder, for chaining
 * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
 */
public MoreLikeThisRequestBuilder setMinimumShouldMatch(String minimumShouldMatch) {
    request.minimumShouldMatch(minimumShouldMatch);
    return this;
}
/**
 * Sets the fraction of terms to match by converting it into the percentage form
 * accepted by {@link #setMinimumShouldMatch(String)}; for example <tt>0.3f</tt>
 * becomes <tt>"30%"</tt>. Prefer calling {@link #setMinimumShouldMatch(String)}
 * directly.
 *
 * @param percentTermsToMatch fraction of terms to match, e.g. <tt>0.3f</tt> for 30 percent
 * @return this builder, for chaining
 */
public MoreLikeThisRequestBuilder setPercentTermsToMatch(float percentTermsToMatch) {
    // A bare (int) cast truncates binary float error: 0.53f * 100 evaluates to
    // 52.999996f and would be sent as "52%". Rounding to 1/100 of a percent first
    // absorbs that error; the integer division then truncates to a whole percent.
    return setMinimumShouldMatch(Math.round(percentTermsToMatch * 10000) / 100 + "%");
}
/**

View File

@ -323,7 +323,7 @@ public class TransportMoreLikeThisAction extends HandledTransportAction<MoreLike
private void addMoreLikeThis(MoreLikeThisRequest request, BoolQueryBuilder boolBuilder, String fieldName, String likeText, boolean failOnUnsupportedField) {
MoreLikeThisFieldQueryBuilder mlt = moreLikeThisFieldQuery(fieldName)
.likeText(likeText)
.percentTermsToMatch(request.percentTermsToMatch())
.minimumShouldMatch(request.minimumShouldMatch())
.boostTerms(request.boostTerms())
.minDocFreq(request.minDocFreq())
.maxDocFreq(request.maxDocFreq())

View File

@ -42,7 +42,7 @@ import java.util.Set;
*/
public class MoreLikeThisQuery extends Query {
public static final float DEFAULT_PERCENT_TERMS_TO_MATCH = 0.3f;
public static final String DEFAULT_MINIMUM_SHOULD_MATCH = "30%";
private TFIDFSimilarity similarity;
@ -50,7 +50,7 @@ public class MoreLikeThisQuery extends Query {
private Fields[] likeFields;
private String[] moreLikeFields;
private Analyzer analyzer;
private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
private String minimumShouldMatch = DEFAULT_MINIMUM_SHOULD_MATCH;
private int minTermFrequency = XMoreLikeThis.DEFAULT_MIN_TERM_FREQ;
private int maxQueryTerms = XMoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
private Set<?> stopWords = XMoreLikeThis.DEFAULT_STOP_WORDS;
@ -84,7 +84,7 @@ public class MoreLikeThisQuery extends Query {
result = 31 * result + minTermFrequency;
result = 31 * result + minWordLen;
result = 31 * result + Arrays.hashCode(moreLikeFields);
result = 31 * result + Float.floatToIntBits(percentTermsToMatch);
result = 31 * result + minimumShouldMatch.hashCode();
result = 31 * result + (stopWords == null ? 0 : stopWords.hashCode());
result = 31 * result + Float.floatToIntBits(getBoost());
return result;
@ -119,7 +119,7 @@ public class MoreLikeThisQuery extends Query {
return false;
if (!Arrays.equals(moreLikeFields, other.moreLikeFields))
return false;
if (percentTermsToMatch != other.percentTermsToMatch)
if (!minimumShouldMatch.equals(other.minimumShouldMatch))
return false;
if (similarity == null) {
if (other.similarity != null)
@ -153,7 +153,7 @@ public class MoreLikeThisQuery extends Query {
BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) {
Query mltQuery = mlt.like(this.likeFields);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
Queries.applyMinimumShouldMatch((BooleanQuery) mltQuery, minimumShouldMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}
if (this.likeText != null) {
@ -163,7 +163,7 @@ public class MoreLikeThisQuery extends Query {
}
//LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
Query mltQuery = mlt.like(moreLikeFields[0], readers);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
Queries.applyMinimumShouldMatch((BooleanQuery) mltQuery, minimumShouldMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}
@ -231,12 +231,24 @@ public class MoreLikeThisQuery extends Query {
this.analyzer = analyzer;
}
public float getPercentTermsToMatch() {
return percentTermsToMatch;
/**
* Number of terms that must match the generated query expressed in the
* common syntax for minimum should match.
*
* @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
*/
public String getMinimumShouldMatch() {
return minimumShouldMatch;
}
public void setPercentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
/**
* Number of terms that must match the generated query expressed in the
* common syntax for minimum should match. Defaults to <tt>30%</tt>.
*
* @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
*/
public void setMinimumShouldMatch(String minimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch;
}
public int getMinTermFrequency() {
@ -310,9 +322,4 @@ public class MoreLikeThisQuery extends Query {
public void setBoostTermsFactor(float boostTermsFactor) {
this.boostTermsFactor = boostTermsFactor;
}
private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) {
BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
}
}

View File

@ -32,7 +32,7 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B
private final String name;
private String likeText;
private float percentTermsToMatch = -1;
private String minimumShouldMatch = null;
private int minTermFreq = -1;
private int maxQueryTerms = -1;
private String[] stopWords = null;
@ -63,12 +63,23 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B
return this;
}
/**
 * Sets the minimum should match specification: the number of terms that must
 * match the generated query, written in the common minimum should match syntax.
 * When left unset (<tt>null</tt>) the field is not written to the query.
 *
 * @param minimumShouldMatch the minimum should match specification
 * @return this builder, for chaining
 * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
 */
public MoreLikeThisFieldQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
    this.minimumShouldMatch = minimumShouldMatch;
    return this;
}
/**
 * Sets the fraction of terms to match by converting it into the percentage form
 * accepted by {@link #minimumShouldMatch(String)}; for example <tt>0.3f</tt>
 * becomes <tt>"30%"</tt>.
 *
 * @param percentTermsToMatch fraction of terms to match, e.g. <tt>0.3f</tt> for 30 percent
 * @return this builder, for chaining
 * @deprecated use {@link #minimumShouldMatch(String)} instead
 */
@Deprecated
public MoreLikeThisFieldQueryBuilder percentTermsToMatch(float percentTermsToMatch) {
    // A bare (int) cast truncates binary float error: 0.53f * 100 evaluates to
    // 52.999996f and would be sent as "52%". Rounding to 1/100 of a percent first
    // absorbs that error; the integer division then truncates to a whole percent.
    return minimumShouldMatch(Math.round(percentTermsToMatch * 10000) / 100 + "%");
}
/**
@ -183,8 +194,8 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B
MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName() +"' to be provided");
}
builder.field(MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName(), likeText);
if (percentTermsToMatch != -1) {
builder.field(MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.getPreferredName(), percentTermsToMatch);
if (minimumShouldMatch != null) {
builder.field(MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch);
}
if (minTermFreq != -1) {
builder.field(MoreLikeThisQueryParser.Fields.MIN_TERM_FREQ.getPreferredName(), minTermFreq);

View File

@ -96,8 +96,10 @@ public class MoreLikeThisFieldQueryParser implements QueryParser {
mltQuery.setBoostTerms(true);
mltQuery.setBoostTermsFactor(boostFactor);
}
} else if (MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.match(currentFieldName,parseContext.parseFlags())) {
mltQuery.setMinimumShouldMatch(parser.text());
} else if (MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.match(currentFieldName,parseContext.parseFlags())) {
mltQuery.setPercentTermsToMatch(parser.floatValue());
mltQuery.setMinimumShouldMatch((int) (parser.floatValue() * 100) + "%");
} else if ("analyzer".equals(currentFieldName)) {
analyzer = parseContext.analysisService().analyzer(parser.text());
} else if ("boost".equals(currentFieldName)) {

View File

@ -103,7 +103,7 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
private List<String> ids = new ArrayList<>();
private List<Item> docs = new ArrayList<>();
private Boolean include = null;
private float percentTermsToMatch = -1;
private String minimumShouldMatch = null;
private int minTermFreq = -1;
private int maxQueryTerms = -1;
private String[] stopWords = null;
@ -161,12 +161,23 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
return this;
}
/**
 * Sets the minimum should match specification: the number of terms that must
 * match the generated query, written in the common minimum should match syntax.
 * When left unset (<tt>null</tt>) the field is not written to the query.
 *
 * @param minimumShouldMatch the minimum should match specification
 * @return this builder, for chaining
 * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
 */
public MoreLikeThisQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
    this.minimumShouldMatch = minimumShouldMatch;
    return this;
}
/**
 * Sets the fraction of terms to match by converting it into the percentage form
 * accepted by {@link #minimumShouldMatch(String)}; for example <tt>0.3f</tt>
 * becomes <tt>"30%"</tt>.
 *
 * @param percentTermsToMatch fraction of terms to match, e.g. <tt>0.3f</tt> for 30 percent
 * @return this builder, for chaining
 * @deprecated use {@link #minimumShouldMatch(String)} instead
 */
@Deprecated
public MoreLikeThisQueryBuilder percentTermsToMatch(float percentTermsToMatch) {
    // A bare (int) cast truncates binary float error: 0.53f * 100 evaluates to
    // 52.999996f and would be sent as "52%". Rounding to 1/100 of a percent first
    // absorbs that error; the integer division then truncates to a whole percent.
    return minimumShouldMatch(Math.round(percentTermsToMatch * 10000) / 100 + "%");
}
/**
@ -287,8 +298,8 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName() +"' or 'docs/ids' to be provided");
}
builder.field(MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName(), likeText);
if (percentTermsToMatch != -1) {
builder.field(MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.getPreferredName(), percentTermsToMatch);
if (minimumShouldMatch != null) {
builder.field(MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch);
}
if (minTermFreq != -1) {
builder.field(MoreLikeThisQueryParser.Fields.MIN_TERM_FREQ.getPreferredName(), minTermFreq);

View File

@ -63,6 +63,7 @@ public class MoreLikeThisQueryParser implements QueryParser {
public static final ParseField MIN_DOC_FREQ = new ParseField("min_doc_freq");
public static final ParseField MAX_DOC_FREQ = new ParseField("max_doc_freq");
public static final ParseField BOOST_TERMS = new ParseField("boost_terms");
public static final ParseField MINIMUM_SHOULD_MATCH = new ParseField("minimum_should_match");
public static final ParseField PERCENT_TERMS_TO_MATCH = new ParseField("percent_terms_to_match");
public static final ParseField FAIL_ON_UNSUPPORTED_FIELD = new ParseField("fail_on_unsupported_field");
public static final ParseField STOP_WORDS = new ParseField("stop_words");
@ -124,8 +125,10 @@ public class MoreLikeThisQueryParser implements QueryParser {
mltQuery.setBoostTerms(true);
mltQuery.setBoostTermsFactor(boostFactor);
}
} else if (Fields.MINIMUM_SHOULD_MATCH.match(currentFieldName, parseContext.parseFlags())) {
mltQuery.setMinimumShouldMatch(parser.text());
} else if (Fields.PERCENT_TERMS_TO_MATCH.match(currentFieldName, parseContext.parseFlags())) {
mltQuery.setPercentTermsToMatch(parser.floatValue());
mltQuery.setMinimumShouldMatch((int) (parser.floatValue() * 100) + "%");
} else if ("analyzer".equals(currentFieldName)) {
analyzer = parseContext.analysisService().analyzer(parser.text());
} else if ("boost".equals(currentFieldName)) {

View File

@ -56,7 +56,7 @@ public class RestMoreLikeThisAction extends BaseRestHandler {
//needs some work if it is to be used in a REST context like this too
// See the MoreLikeThisQueryParser constants that hold the valid syntax
mltRequest.fields(request.paramAsStringArray("mlt_fields", null));
mltRequest.percentTermsToMatch(request.paramAsFloat("percent_terms_to_match", -1));
mltRequest.minimumShouldMatch(request.param("minimum_should_match", "0"));
mltRequest.minTermFreq(request.paramAsInt("min_term_freq", -1));
mltRequest.maxQueryTerms(request.paramAsInt("max_query_terms", -1));
mltRequest.stopWords(request.paramAsStringArray("stop_words", null));

View File

@ -1638,7 +1638,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery();
// all terms must match
mltQuery.setPercentTermsToMatch(1.0f);
mltQuery.setMinimumShouldMatch("100%");
mltQuery.setMinWordLen(0);
mltQuery.setMinDocFreq(0);

View File

@ -40,6 +40,7 @@ import org.junit.Test;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.client.Requests.*;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
@ -407,7 +408,8 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
logger.info("Running MoreLikeThis DSL with IDs");
String id = String.valueOf(getRandom().nextInt(texts.length));
Client client = client();
MoreLikeThisQueryBuilder queryBuilder = QueryBuilders.moreLikeThisQuery("text").ids(id).minTermFreq(1).minDocFreq(1);
MoreLikeThisQueryBuilder queryBuilder = QueryBuilders.moreLikeThisQuery("text").ids(id).minTermFreq(1).minDocFreq(1)
.minimumShouldMatch("0%");
SearchResponse mltResponseDSL = client.prepareSearch()
.setSearchType(SearchType.QUERY_THEN_FETCH)
.setTypes("type1")
@ -417,7 +419,8 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
assertSearchResponse(mltResponseDSL);
logger.info("Running MoreLikeThis API");
MoreLikeThisRequest mltRequest = moreLikeThisRequest("test").type("type1").searchSize(texts.length).id(id).minTermFreq(1).minDocFreq(1);
MoreLikeThisRequest mltRequest = moreLikeThisRequest("test").type("type1").searchSize(texts.length).id(id).minTermFreq(1).minDocFreq(1)
.minimumShouldMatch("0%");
SearchResponse mltResponseAPI = client.moreLikeThis(mltRequest).actionGet();
assertSearchResponse(mltResponseAPI);
@ -523,4 +526,43 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
assertHitCount(response, values.length);
}
}
/**
 * Indexes ten documents, each holding one term fewer than the previous one, and
 * checks the hit count of a more-like-this query for every
 * <tt>minimum_should_match</tt> value from 0% to 100% in steps of 10%.
 */
@Test
public void testMinimumShouldMatch() throws ExecutionException, InterruptedException {
    logger.info("Creating the index ...");
    assertAcked(prepareCreate("test")
            .addMapping("type1", "text", "type=string,analyzer=whitespace")
            .setSettings(SETTING_NUMBER_OF_SHARDS, 1));
    ensureGreen();

    logger.info("Indexing with each doc having one less term ...");
    List<IndexRequestBuilder> indexRequests = new ArrayList<>();
    for (int docId = 0; docId < 10; docId++) {
        // Document docId holds the terms 1 .. (10 - docId), each followed by a space.
        StringBuilder terms = new StringBuilder();
        for (int term = 1; term <= 10 - docId; term++) {
            terms.append(term).append(" ");
        }
        indexRequests.add(client().prepareIndex("test", "type1", docId + "").setSource("text", terms.toString()));
    }
    indexRandom(true, indexRequests);

    logger.info("Testing each minimum_should_match from 0% - 100% with 10% increment ...");
    for (int step = 0; step <= 10; step++) {
        String minimumShouldMatch = (10 * step) + "%";
        MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery("text")
                .likeText("1 2 3 4 5 6 7 8 9 10")
                .minTermFreq(1)
                .minDocFreq(1)
                .minimumShouldMatch(minimumShouldMatch);
        logger.info("Testing with minimum_should_match = " + minimumShouldMatch);
        SearchResponse response = client().prepareSearch("test").setTypes("type1")
                .setQuery(mltQuery).get();
        assertSearchResponse(response);
        // 0% is expected to hit all ten documents; every other step expects 11 - step.
        int expectedHits = step == 0 ? 10 : 11 - step;
        assertHitCount(response, expectedHits);
    }
}
}