MLT Query: fix percent_terms_to_match

The parameter `percent_terms_to_match` (the percentage of terms that must match
in the generated query) was wrongly applied to the top-level boolean query
instead of the query built from the generated terms. This could lead to
all-or-nothing results. This commit ensures that the parameter is applied to
the query of generated terms.

Closes #7754
This commit is contained in:
Alex Ksikes 2014-09-17 01:03:19 +02:00
parent 70303be50c
commit 51bf3e6730
4 changed files with 70 additions and 12 deletions

View File

@ -87,8 +87,8 @@ unless specified otherwise in each `doc`.
|`include` |When using `ids` or `docs`, specifies whether the documents should be |`include` |When using `ids` or `docs`, specifies whether the documents should be
included from the search. Defaults to `false`. included from the search. Defaults to `false`.
|`percent_terms_to_match` |The percentage of terms to match on (float |`percent_terms_to_match` |From the generated query, the percentage of terms
value). Defaults to `0.3` (30 percent). that must match (float value between 0 and 1). Defaults to `0.3` (30 percent).
|`min_term_freq` |The frequency below which terms will be ignored in the |`min_term_freq` |The frequency below which terms will be ignored in the
source doc. The default frequency is `2`. source doc. The default frequency is `2`.

View File

@ -152,7 +152,9 @@ public class MoreLikeThisQuery extends Query {
BooleanQuery bq = new BooleanQuery(); BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) { if (this.likeFields != null) {
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD); Query mltQuery = mlt.like(this.likeFields);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
} }
if (this.likeText != null) { if (this.likeText != null) {
Reader[] readers = new Reader[likeText.length]; Reader[] readers = new Reader[likeText.length];
@ -160,12 +162,11 @@ public class MoreLikeThisQuery extends Query {
readers[i] = new FastStringReader(likeText[i]); readers[i] = new FastStringReader(likeText[i]);
} }
//LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field) //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD); Query mltQuery = mlt.like(moreLikeFields[0], readers);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
} }
BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
bq.setBoost(getBoost()); bq.setBoost(getBoost());
return bq; return bq;
} }
@ -309,4 +310,9 @@ public class MoreLikeThisQuery extends Query {
public void setBoostTermsFactor(float boostTermsFactor) { public void setBoostTermsFactor(float boostTermsFactor) {
this.boostTermsFactor = boostTermsFactor; this.boostTermsFactor = boostTermsFactor;
} }
/**
 * Sets the minimum-should-match value of the given boolean query to its clause count
 * multiplied by {@code percentTermsToMatch}, truncated to an int.
 *
 * @param bq the boolean query to configure; it is modified in place
 * @param percentTermsToMatch the fraction of clauses that must match
 */
private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) {
BooleanClause[] clauses = bq.getClauses();
// the (int) cast drops any fractional part of the product
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
}
} }

View File

@ -639,19 +639,17 @@ public final class XMoreLikeThis {
fieldNames.add(fieldName); fieldNames.add(fieldName);
} }
} }
// to create one query per field name only // term selection is per field, then appended to a single boolean query
BooleanQuery bq = new BooleanQuery(); BooleanQuery bq = new BooleanQuery();
for (String fieldName : fieldNames) { for (String fieldName : fieldNames) {
Map<String, Int> termFreqMap = new HashMap<>(); Map<String, Int> termFreqMap = new HashMap<>();
this.setFieldNames(new String[]{fieldName});
for (Fields fields : likeFields) { for (Fields fields : likeFields) {
Terms vector = fields.terms(fieldName); Terms vector = fields.terms(fieldName);
if (vector != null) { if (vector != null) {
addTermFrequencies(termFreqMap, vector); addTermFrequencies(termFreqMap, vector);
} }
} }
Query query = createQuery(createQueue(termFreqMap)); addToQuery(createQueue(termFreqMap, fieldName), bq);
bq.add(query, BooleanClause.Occur.SHOULD);
} }
return bq; return bq;
} }
@ -661,6 +659,14 @@ public final class XMoreLikeThis {
*/ */
private Query createQuery(PriorityQueue<ScoreTerm> q) { private Query createQuery(PriorityQueue<ScoreTerm> q) {
BooleanQuery query = new BooleanQuery(); BooleanQuery query = new BooleanQuery();
addToQuery(q, query);
return query;
}
/**
* Add to an existing boolean query the More Like This query from this PriorityQueue
*/
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
ScoreTerm scoreTerm; ScoreTerm scoreTerm;
float bestScore = -1; float bestScore = -1;
@ -682,7 +688,6 @@ public final class XMoreLikeThis {
break; break;
} }
} }
return query;
} }
/** /**
@ -691,6 +696,16 @@ public final class XMoreLikeThis {
* @param words a map of words keyed on the word(String) with Int objects as the values. * @param words a map of words keyed on the word(String) with Int objects as the values.
*/ */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException { private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
return createQueue(words, this.fieldNames);
}
/**
* Create a PriorityQueue from a word->tf map.
*
* @param words a map of words keyed on the word(String) with Int objects as the values.
* @param fieldNames an array of field names to override defaults.
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
// have collected all words in doc and their freqs // have collected all words in doc and their freqs
int numDocs = ir.numDocs(); int numDocs = ir.numDocs();
final int limit = Math.min(maxQueryTerms, words.size()); final int limit = Math.min(maxQueryTerms, words.size());

View File

@ -1623,6 +1623,43 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
} }
} }
/**
 * Checks that the percent-terms-to-match setting ends up on each sub-query of the
 * rewritten MLT query: with the percentage set to 1.0, the first clause (items) is
 * expected to require 4 matches and the second clause (like_text) 2.
 */
@Test
public void testMLTPercentTermsToMatch() throws Exception {
// setup for mocking fetching items
MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
parser.setFetchService(new MockMoreLikeThisFetchService());
// parsing the ES query
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
BooleanQuery parsedQuery = (BooleanQuery) queryParser.parse(query).query();
// get MLT query, other clause is for include/exclude items
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery();
// all terms must match
mltQuery.setPercentTermsToMatch(1.0f);
mltQuery.setMinWordLen(0);
mltQuery.setMinDocFreq(0);
// one document has all values
MemoryIndex index = new MemoryIndex();
index.addField("name.first", "apache lucene", new WhitespaceAnalyzer());
index.addField("name.last", "1 2 3 4", new WhitespaceAnalyzer());
// two clauses, one for items and one for like_text if set
BooleanQuery luceneQuery = (BooleanQuery) mltQuery.rewrite(index.createSearcher().getIndexReader());
BooleanClause[] clauses = luceneQuery.getClauses();
// check for items
int minNumberShouldMatch = ((BooleanQuery) (clauses[0].getQuery())).getMinimumNumberShouldMatch();
assertThat(minNumberShouldMatch, is(4));
// and for like_text
minNumberShouldMatch = ((BooleanQuery) (clauses[1].getQuery())).getMinimumNumberShouldMatch();
assertThat(minNumberShouldMatch, is(2));
}
private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService { private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
public MockMoreLikeThisFetchService() { public MockMoreLikeThisFetchService() {