From 51bf3e6730761674b07d2aeb937d107669c48b9e Mon Sep 17 00:00:00 2001 From: Alex Ksikes Date: Wed, 17 Sep 2014 01:03:19 +0200 Subject: [PATCH] MLT Query: fix percent_terms_to_match The parameter `percent_terms_to_match` (percentage of terms that must match in the generated query) was wrongly set to the top level boolean query. This would lead to zero or all results type of situations. This commit ensures that the parameter is indeed applied to the query of generated terms. Closes #7754 --- .../query-dsl/queries/mlt-query.asciidoc | 4 +- .../lucene/search/MoreLikeThisQuery.java | 16 +++++--- .../common/lucene/search/XMoreLikeThis.java | 25 ++++++++++--- .../query/SimpleIndexQueryParserTests.java | 37 +++++++++++++++++++ 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/docs/reference/query-dsl/queries/mlt-query.asciidoc b/docs/reference/query-dsl/queries/mlt-query.asciidoc index bbd0ae16a08..c451677b692 100644 --- a/docs/reference/query-dsl/queries/mlt-query.asciidoc +++ b/docs/reference/query-dsl/queries/mlt-query.asciidoc @@ -87,8 +87,8 @@ unless specified otherwise in each `doc`. |`include` |When using `ids` or `docs`, specifies whether the documents should be included from the search. Defaults to `false`. -|`percent_terms_to_match` |The percentage of terms to match on (float -value). Defaults to `0.3` (30 percent). +|`percent_terms_to_match` |From the generated query, the percentage of terms +that must match (float value between 0 and 1). Defaults to `0.3` (30 percent). |`min_term_freq` |The frequency below which terms will be ignored in the source doc. The default frequency is `2`. diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java index c712f60aa44..4c17e01675c 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java @@ -152,7 +152,9 @@ public class MoreLikeThisQuery extends Query { BooleanQuery bq = new BooleanQuery(); if (this.likeFields != null) { - bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD); + Query mltQuery = mlt.like(this.likeFields); + setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch); + bq.add(mltQuery, BooleanClause.Occur.SHOULD); } if (this.likeText != null) { Reader[] readers = new Reader[likeText.length]; @@ -160,12 +162,11 @@ public class MoreLikeThisQuery extends Query { readers[i] = new FastStringReader(likeText[i]); } //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field) - bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD); + Query mltQuery = mlt.like(moreLikeFields[0], readers); + setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch); + bq.add(mltQuery, BooleanClause.Occur.SHOULD); } - BooleanClause[] clauses = bq.getClauses(); - bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); - bq.setBoost(getBoost()); return bq; } @@ -309,4 +310,9 @@ public class MoreLikeThisQuery extends Query { public void setBoostTermsFactor(float boostTermsFactor) { this.boostTermsFactor = boostTermsFactor; } + + private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) { + BooleanClause[] clauses = bq.getClauses(); + bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); + } } diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java index 39fecec0c91..3fbc30c07c6 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java @@ -639,19 +639,17 @@ public final class XMoreLikeThis { fieldNames.add(fieldName); } } - // to create one query per field name only + // term selection is per field, then appended to a single boolean query BooleanQuery bq = new BooleanQuery(); for (String fieldName : fieldNames) { Map termFreqMap = new HashMap<>(); - this.setFieldNames(new String[]{fieldName}); for (Fields fields : likeFields) { Terms vector = fields.terms(fieldName); if (vector != null) { addTermFrequencies(termFreqMap, vector); } } - Query query = createQuery(createQueue(termFreqMap)); - bq.add(query, BooleanClause.Occur.SHOULD); + addToQuery(createQueue(termFreqMap, fieldName), bq); } return bq; } @@ -661,6 +659,14 @@ public final class XMoreLikeThis { */ private Query createQuery(PriorityQueue q) { BooleanQuery query = new BooleanQuery(); + addToQuery(q, query); + return query; + } + + /** + * Add to an existing boolean query the More Like This query from this PriorityQueue + */ + private void addToQuery(PriorityQueue q, BooleanQuery query) { ScoreTerm scoreTerm; float bestScore = -1; @@ -682,7 +688,6 @@ public final class XMoreLikeThis { break; } } - return query; } /** @@ -691,6 +696,16 @@ public final class XMoreLikeThis { * @param words a map of words keyed on the word(String) with Int objects as the values. */ private PriorityQueue createQueue(Map words) throws IOException { + return createQueue(words, this.fieldNames); + } + + /** + * Create a PriorityQueue from a word->tf map. + * + * @param words a map of words keyed on the word(String) with Int objects as the values. + * @param fieldNames an array of field names to override defaults. + */ + private PriorityQueue createQueue(Map words, String... fieldNames) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, words.size()); diff --git a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java index 6bc1be655e7..d46631d4206 100644 --- a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java +++ b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java @@ -1623,6 +1623,43 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest { } } + @Test + public void testMLTPercentTermsToMatch() throws Exception { + // setup for mocking fetching items + MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this"); + parser.setFetchService(new MockMoreLikeThisFetchService()); + + // parsing the ES query + IndexQueryParserService queryParser = queryParser(); + String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json"); + BooleanQuery parsedQuery = (BooleanQuery) queryParser.parse(query).query(); + + // get MLT query, other clause is for include/exclude items + MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery(); + + // all terms must match + mltQuery.setPercentTermsToMatch(1.0f); + mltQuery.setMinWordLen(0); + mltQuery.setMinDocFreq(0); + + // one document has all values + MemoryIndex index = new MemoryIndex(); + index.addField("name.first", "apache lucene", new WhitespaceAnalyzer()); + index.addField("name.last", "1 2 3 4", new WhitespaceAnalyzer()); + + // two clauses, one for items and one for like_text if set + BooleanQuery luceneQuery = (BooleanQuery) mltQuery.rewrite(index.createSearcher().getIndexReader()); + BooleanClause[] clauses = luceneQuery.getClauses(); + + // check for items + int minNumberShouldMatch = ((BooleanQuery) (clauses[0].getQuery())).getMinimumNumberShouldMatch(); + assertThat(minNumberShouldMatch, is(4)); + + // and for like_text + minNumberShouldMatch = ((BooleanQuery) (clauses[1].getQuery())).getMinimumNumberShouldMatch(); + assertThat(minNumberShouldMatch, is(2)); + } + private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService { public MockMoreLikeThisFetchService() {