MLT Query: fix percent_terms_to_match

The parameter `percent_terms_to_match` (percentage of terms that must match in
the generated query) was wrongly set to the top level boolean query. This
would lead to zero or all results type of situations. This commit ensures that
the parameter is indeed applied to the query of generated terms.

Closes #7754
This commit is contained in:
Alex Ksikes 2014-09-17 01:03:19 +02:00
parent 70303be50c
commit 51bf3e6730
4 changed files with 70 additions and 12 deletions

View File

@ -87,8 +87,8 @@ unless specified otherwise in each `doc`.
|`include` |When using `ids` or `docs`, specifies whether the documents should be
included from the search. Defaults to `false`.
|`percent_terms_to_match` |The percentage of terms to match on (float
value). Defaults to `0.3` (30 percent).
|`percent_terms_to_match` |From the generated query, the percentage of terms
that must match (float value between 0 and 1). Defaults to `0.3` (30 percent).
|`min_term_freq` |The frequency below which terms will be ignored in the
source doc. The default frequency is `2`.

View File

@ -152,7 +152,9 @@ public class MoreLikeThisQuery extends Query {
BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) {
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
Query mltQuery = mlt.like(this.likeFields);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}
if (this.likeText != null) {
Reader[] readers = new Reader[likeText.length];
@ -160,12 +162,11 @@ public class MoreLikeThisQuery extends Query {
readers[i] = new FastStringReader(likeText[i]);
}
//LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
Query mltQuery = mlt.like(moreLikeFields[0], readers);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}
BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
bq.setBoost(getBoost());
return bq;
}
@ -309,4 +310,9 @@ public class MoreLikeThisQuery extends Query {
public void setBoostTermsFactor(float boostTermsFactor) {
this.boostTermsFactor = boostTermsFactor;
}
private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) {
BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
}
}

View File

@ -639,19 +639,17 @@ public final class XMoreLikeThis {
fieldNames.add(fieldName);
}
}
// to create one query per field name only
// term selection is per field, then appended to a single boolean query
BooleanQuery bq = new BooleanQuery();
for (String fieldName : fieldNames) {
Map<String, Int> termFreqMap = new HashMap<>();
this.setFieldNames(new String[]{fieldName});
for (Fields fields : likeFields) {
Terms vector = fields.terms(fieldName);
if (vector != null) {
addTermFrequencies(termFreqMap, vector);
}
}
Query query = createQuery(createQueue(termFreqMap));
bq.add(query, BooleanClause.Occur.SHOULD);
addToQuery(createQueue(termFreqMap, fieldName), bq);
}
return bq;
}
@ -661,6 +659,14 @@ public final class XMoreLikeThis {
*/
private Query createQuery(PriorityQueue<ScoreTerm> q) {
BooleanQuery query = new BooleanQuery();
addToQuery(q, query);
return query;
}
/**
* Add to an existing boolean query the More Like This query from this PriorityQueue
*/
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
ScoreTerm scoreTerm;
float bestScore = -1;
@ -682,7 +688,6 @@ public final class XMoreLikeThis {
break;
}
}
return query;
}
/**
@ -691,6 +696,16 @@ public final class XMoreLikeThis {
* @param words a map of words keyed on the word(String) with Int objects as the values.
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
return createQueue(words, this.fieldNames);
}
/**
* Create a PriorityQueue from a word->tf map.
*
* @param words a map of words keyed on the word(String) with Int objects as the values.
* @param fieldNames an array of field names to override defaults.
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
final int limit = Math.min(maxQueryTerms, words.size());

View File

@ -1623,6 +1623,43 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
}
}
@Test
public void testMLTPercentTermsToMatch() throws Exception {
// setup for mocking fetching items
MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
parser.setFetchService(new MockMoreLikeThisFetchService());
// parsing the ES query
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
BooleanQuery parsedQuery = (BooleanQuery) queryParser.parse(query).query();
// get MLT query, other clause is for include/exclude items
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery();
// all terms must match
mltQuery.setPercentTermsToMatch(1.0f);
mltQuery.setMinWordLen(0);
mltQuery.setMinDocFreq(0);
// one document has all values
MemoryIndex index = new MemoryIndex();
index.addField("name.first", "apache lucene", new WhitespaceAnalyzer());
index.addField("name.last", "1 2 3 4", new WhitespaceAnalyzer());
// two clauses, one for items and one for like_text if set
BooleanQuery luceneQuery = (BooleanQuery) mltQuery.rewrite(index.createSearcher().getIndexReader());
BooleanClause[] clauses = luceneQuery.getClauses();
// check for items
int minNumberShouldMatch = ((BooleanQuery) (clauses[0].getQuery())).getMinimumNumberShouldMatch();
assertThat(minNumberShouldMatch, is(4));
// and for like_text
minNumberShouldMatch = ((BooleanQuery) (clauses[1].getQuery())).getMinimumNumberShouldMatch();
assertThat(minNumberShouldMatch, is(2));
}
private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
public MockMoreLikeThisFetchService() {