diff --git a/docs/reference/query-dsl/queries/mlt-field-query.asciidoc b/docs/reference/query-dsl/queries/mlt-field-query.asciidoc index fdcb735197f..2c6338ccf08 100644 --- a/docs/reference/query-dsl/queries/mlt-field-query.asciidoc +++ b/docs/reference/query-dsl/queries/mlt-field-query.asciidoc @@ -29,8 +29,9 @@ The `more_like_this_field` top level parameters include: |Parameter |Description |`like_text` |The text to find documents like it, *required*. -|`percent_terms_to_match` |The percentage of terms to match on (float -value). Defaults to `0.3` (30 percent). +|`minimum_should_match`| From the generated query, the number of terms that +must match following the +<<query-dsl-minimum-should-match,minimum should match syntax>>. (Defaults to `"30%"`). |`min_term_freq` |The frequency below which terms will be ignored in the source doc. The default frequency is `2`. diff --git a/docs/reference/query-dsl/queries/mlt-query.asciidoc b/docs/reference/query-dsl/queries/mlt-query.asciidoc index c451677b692..776d6265730 100644 --- a/docs/reference/query-dsl/queries/mlt-query.asciidoc +++ b/docs/reference/query-dsl/queries/mlt-query.asciidoc @@ -87,8 +87,9 @@ unless specified otherwise in each `doc`. |`include` |When using `ids` or `docs`, specifies whether the documents should be included from the search. Defaults to `false`. -|`percent_terms_to_match` |From the generated query, the percentage of terms -that must match (float value between 0 and 1). Defaults to `0.3` (30 percent). +|`minimum_should_match`| From the generated query, the number of terms that +must match following the +<<query-dsl-minimum-should-match,minimum should match syntax>>. (Defaults to `"30%"`). |`min_term_freq` |The frequency below which terms will be ignored in the source doc. The default frequency is `2`. 
diff --git a/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequest.java b/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequest.java index 7b1cb8c972f..c058c1b8bfa 100644 --- a/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequest.java +++ b/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequest.java @@ -33,6 +33,7 @@ import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.lucene.search.Queries; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.search.Scroll; @@ -66,7 +67,7 @@ public class MoreLikeThisRequest extends ActionRequest impl private String[] fields; - private float percentTermsToMatch = -1; + private String minimumShouldMatch = "0%"; private int minTermFreq = -1; private int maxQueryTerms = -1; private String[] stopWords = null; @@ -211,18 +212,44 @@ public class MoreLikeThisRequest extends ActionRequest impl } /** - * The percent of the terms to match for each field. Defaults to 0.3f. + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. Defaults to 30%. + * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) */ - public MoreLikeThisRequest percentTermsToMatch(float percentTermsToMatch) { - this.percentTermsToMatch = percentTermsToMatch; + public MoreLikeThisRequest minimumShouldMatch(String minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; return this; } + /** + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. 
+ * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public String minimumShouldMatch() { + return this.minimumShouldMatch; + } + /** * The percent of the terms to match for each field. Defaults to 0.3f. */ + @Deprecated + public MoreLikeThisRequest percentTermsToMatch(float percentTermsToMatch) { + return minimumShouldMatch((int) (percentTermsToMatch * 100) + "%"); + } + + /** + * The percent of the terms to match for each field. Defaults to 0.3f. + */ + @Deprecated public float percentTermsToMatch() { - return this.percentTermsToMatch; + if (minimumShouldMatch.endsWith("%")) { + return Float.parseFloat(minimumShouldMatch.substring(0, minimumShouldMatch.indexOf("%"))) / 100; + } else { + return -1; + } } /** @@ -584,7 +611,12 @@ public class MoreLikeThisRequest extends ActionRequest impl } } - percentTermsToMatch = in.readFloat(); + if (in.getVersion().onOrAfter(Version.V_1_5_0)) { + minimumShouldMatch(in.readString()); + } else { + percentTermsToMatch(in.readFloat()); + } + minTermFreq = in.readVInt(); maxQueryTerms = in.readVInt(); size = in.readVInt(); @@ -661,7 +693,12 @@ public class MoreLikeThisRequest extends ActionRequest impl } } - out.writeFloat(percentTermsToMatch); + if (out.getVersion().onOrAfter(Version.V_1_5_0)) { + out.writeString(minimumShouldMatch); + } else { + out.writeFloat(percentTermsToMatch()); + } + out.writeVInt(minTermFreq); out.writeVInt(maxQueryTerms); if (stopWords == null) { diff --git a/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequestBuilder.java b/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequestBuilder.java index 9d075f13305..e8226f8df82 100644 --- a/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequestBuilder.java +++ b/src/main/java/org/elasticsearch/action/mlt/MoreLikeThisRequestBuilder.java @@ -60,12 +60,22 @@ public class MoreLikeThisRequestBuilder extends ActionRequestBuilder30%. 
+ * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public MoreLikeThisRequestBuilder setMinimumShouldMatch(String minimumShouldMatch) { + request.minimumShouldMatch(minimumShouldMatch); + return this; + } + /** * The percent of the terms to match for each field. Defaults to 0.3f. */ public MoreLikeThisRequestBuilder setPercentTermsToMatch(float percentTermsToMatch) { - request.percentTermsToMatch(percentTermsToMatch); - return this; + return setMinimumShouldMatch((int) (percentTermsToMatch * 100) + "%"); } /** diff --git a/src/main/java/org/elasticsearch/action/mlt/TransportMoreLikeThisAction.java b/src/main/java/org/elasticsearch/action/mlt/TransportMoreLikeThisAction.java index 099bbb0a0bd..a7c30c0a7eb 100644 --- a/src/main/java/org/elasticsearch/action/mlt/TransportMoreLikeThisAction.java +++ b/src/main/java/org/elasticsearch/action/mlt/TransportMoreLikeThisAction.java @@ -323,7 +323,7 @@ public class TransportMoreLikeThisAction extends HandledTransportAction stopWords = XMoreLikeThis.DEFAULT_STOP_WORDS; @@ -84,7 +84,7 @@ public class MoreLikeThisQuery extends Query { result = 31 * result + minTermFrequency; result = 31 * result + minWordLen; result = 31 * result + Arrays.hashCode(moreLikeFields); - result = 31 * result + Float.floatToIntBits(percentTermsToMatch); + result = 31 * result + minimumShouldMatch.hashCode(); result = 31 * result + (stopWords == null ? 
0 : stopWords.hashCode()); result = 31 * result + Float.floatToIntBits(getBoost()); return result; @@ -119,7 +119,7 @@ public class MoreLikeThisQuery extends Query { return false; if (!Arrays.equals(moreLikeFields, other.moreLikeFields)) return false; - if (percentTermsToMatch != other.percentTermsToMatch) + if (!minimumShouldMatch.equals(other.minimumShouldMatch)) return false; if (similarity == null) { if (other.similarity != null) @@ -153,7 +153,7 @@ public class MoreLikeThisQuery extends Query { BooleanQuery bq = new BooleanQuery(); if (this.likeFields != null) { Query mltQuery = mlt.like(this.likeFields); - setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch); + Queries.applyMinimumShouldMatch((BooleanQuery) mltQuery, minimumShouldMatch); bq.add(mltQuery, BooleanClause.Occur.SHOULD); } if (this.likeText != null) { @@ -163,7 +163,7 @@ public class MoreLikeThisQuery extends Query { } //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field) Query mltQuery = mlt.like(moreLikeFields[0], readers); - setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch); + Queries.applyMinimumShouldMatch((BooleanQuery) mltQuery, minimumShouldMatch); bq.add(mltQuery, BooleanClause.Occur.SHOULD); } @@ -231,12 +231,24 @@ public class MoreLikeThisQuery extends Query { this.analyzer = analyzer; } - public float getPercentTermsToMatch() { - return percentTermsToMatch; + /** + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. + * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public String getMinimumShouldMatch() { + return minimumShouldMatch; } - public void setPercentTermsToMatch(float percentTermsToMatch) { - this.percentTermsToMatch = percentTermsToMatch; + /** + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. Defaults to 30%. 
+ * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public void setMinimumShouldMatch(String minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; } public int getMinTermFrequency() { @@ -310,9 +322,4 @@ public class MoreLikeThisQuery extends Query { public void setBoostTermsFactor(float boostTermsFactor) { this.boostTermsFactor = boostTermsFactor; } - - private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) { - BooleanClause[] clauses = bq.getClauses(); - bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); - } } diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryBuilder.java index 72bed1698ae..3ea61feb033 100644 --- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryBuilder.java +++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryBuilder.java @@ -32,7 +32,7 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B private final String name; private String likeText; - private float percentTermsToMatch = -1; + private String minimumShouldMatch = null; private int minTermFreq = -1; private int maxQueryTerms = -1; private String[] stopWords = null; @@ -63,12 +63,23 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B return this; } + /** + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. Defaults to 30%. + * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public MoreLikeThisFieldQueryBuilder minimumShouldMatch(String minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; + return this; + } + /** * The percentage of terms to match. Defaults to 0.3. 
*/ + @Deprecated public MoreLikeThisFieldQueryBuilder percentTermsToMatch(float percentTermsToMatch) { - this.percentTermsToMatch = percentTermsToMatch; - return this; + return minimumShouldMatch((int) (percentTermsToMatch * 100) + "%"); } /** @@ -183,8 +194,8 @@ public class MoreLikeThisFieldQueryBuilder extends BaseQueryBuilder implements B MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName() +"' to be provided"); } builder.field(MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName(), likeText); - if (percentTermsToMatch != -1) { - builder.field(MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.getPreferredName(), percentTermsToMatch); + if (minimumShouldMatch != null) { + builder.field(MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch); } if (minTermFreq != -1) { builder.field(MoreLikeThisQueryParser.Fields.MIN_TERM_FREQ.getPreferredName(), minTermFreq); diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryParser.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryParser.java index ca714490325..ca838506369 100644 --- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisFieldQueryParser.java @@ -96,8 +96,10 @@ public class MoreLikeThisFieldQueryParser implements QueryParser { mltQuery.setBoostTerms(true); mltQuery.setBoostTermsFactor(boostFactor); } + } else if (MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.match(currentFieldName,parseContext.parseFlags())) { + mltQuery.setMinimumShouldMatch(parser.text()); } else if (MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.match(currentFieldName,parseContext.parseFlags())) { - mltQuery.setPercentTermsToMatch(parser.floatValue()); + mltQuery.setMinimumShouldMatch((int) (parser.floatValue() * 100) + "%"); } else if ("analyzer".equals(currentFieldName)) { analyzer = parseContext.analysisService().analyzer(parser.text()); } 
else if ("boost".equals(currentFieldName)) { diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java index 5059b8f3ab8..51ac957b86d 100644 --- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java +++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java @@ -103,7 +103,7 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta private List ids = new ArrayList<>(); private List docs = new ArrayList<>(); private Boolean include = null; - private float percentTermsToMatch = -1; + private String minimumShouldMatch = null; private int minTermFreq = -1; private int maxQueryTerms = -1; private String[] stopWords = null; @@ -161,12 +161,23 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta return this; } + /** + * Number of terms that must match the generated query expressed in the + * common syntax for minimum should match. Defaults to 30%. + * + * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String) + */ + public MoreLikeThisQueryBuilder minimumShouldMatch(String minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; + return this; + } + /** * The percentage of terms to match. Defaults to 0.3. 
*/ + @Deprecated public MoreLikeThisQueryBuilder percentTermsToMatch(float percentTermsToMatch) { - this.percentTermsToMatch = percentTermsToMatch; - return this; + return minimumShouldMatch((int) (percentTermsToMatch * 100) + "%"); } /** @@ -287,8 +298,8 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName() +"' or 'docs/ids' to be provided"); } builder.field(MoreLikeThisQueryParser.Fields.LIKE_TEXT.getPreferredName(), likeText); - if (percentTermsToMatch != -1) { - builder.field(MoreLikeThisQueryParser.Fields.PERCENT_TERMS_TO_MATCH.getPreferredName(), percentTermsToMatch); + if (minimumShouldMatch != null) { + builder.field(MoreLikeThisQueryParser.Fields.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch); } if (minTermFreq != -1) { builder.field(MoreLikeThisQueryParser.Fields.MIN_TERM_FREQ.getPreferredName(), minTermFreq); diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java index f94dee7254f..6d7e8b5b1ad 100644 --- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java @@ -63,6 +63,7 @@ public class MoreLikeThisQueryParser implements QueryParser { public static final ParseField MIN_DOC_FREQ = new ParseField("min_doc_freq"); public static final ParseField MAX_DOC_FREQ = new ParseField("max_doc_freq"); public static final ParseField BOOST_TERMS = new ParseField("boost_terms"); + public static final ParseField MINIMUM_SHOULD_MATCH = new ParseField("minimum_should_match"); public static final ParseField PERCENT_TERMS_TO_MATCH = new ParseField("percent_terms_to_match"); public static final ParseField FAIL_ON_UNSUPPORTED_FIELD = new ParseField("fail_on_unsupported_field"); public static final ParseField STOP_WORDS = new ParseField("stop_words"); @@ -124,8 +125,10 @@ 
public class MoreLikeThisQueryParser implements QueryParser { mltQuery.setBoostTerms(true); mltQuery.setBoostTermsFactor(boostFactor); } + } else if (Fields.MINIMUM_SHOULD_MATCH.match(currentFieldName, parseContext.parseFlags())) { + mltQuery.setMinimumShouldMatch(parser.text()); } else if (Fields.PERCENT_TERMS_TO_MATCH.match(currentFieldName, parseContext.parseFlags())) { - mltQuery.setPercentTermsToMatch(parser.floatValue()); + mltQuery.setMinimumShouldMatch((int) (parser.floatValue() * 100) + "%"); } else if ("analyzer".equals(currentFieldName)) { analyzer = parseContext.analysisService().analyzer(parser.text()); } else if ("boost".equals(currentFieldName)) { diff --git a/src/main/java/org/elasticsearch/rest/action/mlt/RestMoreLikeThisAction.java b/src/main/java/org/elasticsearch/rest/action/mlt/RestMoreLikeThisAction.java index 8a201218384..b716e3881e7 100644 --- a/src/main/java/org/elasticsearch/rest/action/mlt/RestMoreLikeThisAction.java +++ b/src/main/java/org/elasticsearch/rest/action/mlt/RestMoreLikeThisAction.java @@ -56,7 +56,7 @@ public class RestMoreLikeThisAction extends BaseRestHandler { //needs some work if it is to be used in a REST context like this too // See the MoreLikeThisQueryParser constants that hold the valid syntax mltRequest.fields(request.paramAsStringArray("mlt_fields", null)); - mltRequest.percentTermsToMatch(request.paramAsFloat("percent_terms_to_match", -1)); + mltRequest.minimumShouldMatch(request.param("minimum_should_match", "0")); mltRequest.minTermFreq(request.paramAsInt("min_term_freq", -1)); mltRequest.maxQueryTerms(request.paramAsInt("max_query_terms", -1)); mltRequest.stopWords(request.paramAsStringArray("stop_words", null)); diff --git a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java index d46631d4206..8c4b5a40a5f 100644 --- a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java +++ 
b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java @@ -1638,7 +1638,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest { MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery(); // all terms must match - mltQuery.setPercentTermsToMatch(1.0f); + mltQuery.setMinimumShouldMatch("100%"); mltQuery.setMinWordLen(0); mltQuery.setMinDocFreq(0); diff --git a/src/test/java/org/elasticsearch/mlt/MoreLikeThisActionTests.java b/src/test/java/org/elasticsearch/mlt/MoreLikeThisActionTests.java index 879927c122a..6bc35e8178d 100644 --- a/src/test/java/org/elasticsearch/mlt/MoreLikeThisActionTests.java +++ b/src/test/java/org/elasticsearch/mlt/MoreLikeThisActionTests.java @@ -40,6 +40,7 @@ import org.junit.Test; import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.concurrent.ExecutionException; import static org.elasticsearch.client.Requests.*; import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; @@ -407,7 +408,8 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest { logger.info("Running MoreLikeThis DSL with IDs"); String id = String.valueOf(getRandom().nextInt(texts.length)); Client client = client(); - MoreLikeThisQueryBuilder queryBuilder = QueryBuilders.moreLikeThisQuery("text").ids(id).minTermFreq(1).minDocFreq(1); + MoreLikeThisQueryBuilder queryBuilder = QueryBuilders.moreLikeThisQuery("text").ids(id).minTermFreq(1).minDocFreq(1) + .minimumShouldMatch("0%"); SearchResponse mltResponseDSL = client.prepareSearch() .setSearchType(SearchType.QUERY_THEN_FETCH) .setTypes("type1") @@ -417,7 +419,8 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest { assertSearchResponse(mltResponseDSL); logger.info("Running MoreLikeThis API"); - MoreLikeThisRequest mltRequest = 
moreLikeThisRequest("test").type("type1").searchSize(texts.length).id(id).minTermFreq(1).minDocFreq(1); + MoreLikeThisRequest mltRequest = moreLikeThisRequest("test").type("type1").searchSize(texts.length).id(id).minTermFreq(1).minDocFreq(1) + .minimumShouldMatch("0%"); SearchResponse mltResponseAPI = client.moreLikeThis(mltRequest).actionGet(); assertSearchResponse(mltResponseAPI); @@ -523,4 +526,43 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest { assertHitCount(response, values.length); } } + + @Test + public void testMinimumShouldMatch() throws ExecutionException, InterruptedException { + logger.info("Creating the index ..."); + assertAcked(prepareCreate("test") + .addMapping("type1", "text", "type=string,analyzer=whitespace") + .setSettings(SETTING_NUMBER_OF_SHARDS, 1)); + ensureGreen(); + + logger.info("Indexing with each doc having one less term ..."); + List builders = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + String text = ""; + for (int j = 1; j <= 10 - i; j++) { + text += j + " "; + } + builders.add(client().prepareIndex("test", "type1", i + "").setSource("text", text)); + } + indexRandom(true, builders); + + logger.info("Testing each minimum_should_match from 0% - 100% with 10% increment ..."); + for (int i = 0; i <= 10; i++) { + String minimumShouldMatch = (10 * i) + "%"; + MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery("text") + .likeText("1 2 3 4 5 6 7 8 9 10") + .minTermFreq(1) + .minDocFreq(1) + .minimumShouldMatch(minimumShouldMatch); + logger.info("Testing with minimum_should_match = " + minimumShouldMatch); + SearchResponse response = client().prepareSearch("test").setTypes("type1") + .setQuery(mltQuery).get(); + assertSearchResponse(response); + if (minimumShouldMatch.equals("0%")) { + assertHitCount(response, 10); + } else { + assertHitCount(response, 11 - i); + } + } + } }