MLT Query: fix percent_terms_to_match
The parameter `percent_terms_to_match` (percentage of terms that must match in the generated query) was wrongly set to the top level boolean query. This would lead to zero or all results type of situations. This commit ensures that the parameter is indeed applied to the query of generated terms. Closes #7754
This commit is contained in:
parent
70303be50c
commit
51bf3e6730
|
@ -87,8 +87,8 @@ unless specified otherwise in each `doc`.
|
|||
|`include` |When using `ids` or `docs`, specifies whether the documents should be
|
||||
included from the search. Defaults to `false`.
|
||||
|
||||
|`percent_terms_to_match` |The percentage of terms to match on (float
|
||||
value). Defaults to `0.3` (30 percent).
|
||||
|`percent_terms_to_match` |From the generated query, the percentage of terms
|
||||
that must match (float value between 0 and 1). Defaults to `0.3` (30 percent).
|
||||
|
||||
|`min_term_freq` |The frequency below which terms will be ignored in the
|
||||
source doc. The default frequency is `2`.
|
||||
|
|
|
@ -152,7 +152,9 @@ public class MoreLikeThisQuery extends Query {
|
|||
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
if (this.likeFields != null) {
|
||||
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
|
||||
Query mltQuery = mlt.like(this.likeFields);
|
||||
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
|
||||
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
if (this.likeText != null) {
|
||||
Reader[] readers = new Reader[likeText.length];
|
||||
|
@ -160,12 +162,11 @@ public class MoreLikeThisQuery extends Query {
|
|||
readers[i] = new FastStringReader(likeText[i]);
|
||||
}
|
||||
//LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
|
||||
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
|
||||
Query mltQuery = mlt.like(moreLikeFields[0], readers);
|
||||
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
|
||||
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
|
||||
bq.setBoost(getBoost());
|
||||
return bq;
|
||||
}
|
||||
|
@ -309,4 +310,9 @@ public class MoreLikeThisQuery extends Query {
|
|||
public void setBoostTermsFactor(float boostTermsFactor) {
|
||||
this.boostTermsFactor = boostTermsFactor;
|
||||
}
|
||||
|
||||
private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) {
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -639,19 +639,17 @@ public final class XMoreLikeThis {
|
|||
fieldNames.add(fieldName);
|
||||
}
|
||||
}
|
||||
// to create one query per field name only
|
||||
// term selection is per field, then appended to a single boolean query
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
for (String fieldName : fieldNames) {
|
||||
Map<String, Int> termFreqMap = new HashMap<>();
|
||||
this.setFieldNames(new String[]{fieldName});
|
||||
for (Fields fields : likeFields) {
|
||||
Terms vector = fields.terms(fieldName);
|
||||
if (vector != null) {
|
||||
addTermFrequencies(termFreqMap, vector);
|
||||
}
|
||||
}
|
||||
Query query = createQuery(createQueue(termFreqMap));
|
||||
bq.add(query, BooleanClause.Occur.SHOULD);
|
||||
addToQuery(createQueue(termFreqMap, fieldName), bq);
|
||||
}
|
||||
return bq;
|
||||
}
|
||||
|
@ -661,6 +659,14 @@ public final class XMoreLikeThis {
|
|||
*/
|
||||
private Query createQuery(PriorityQueue<ScoreTerm> q) {
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
addToQuery(q, query);
|
||||
return query;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add to an existing boolean query the More Like This query from this PriorityQueue
|
||||
*/
|
||||
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
|
||||
ScoreTerm scoreTerm;
|
||||
float bestScore = -1;
|
||||
|
||||
|
@ -682,7 +688,6 @@ public final class XMoreLikeThis {
|
|||
break;
|
||||
}
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -691,6 +696,16 @@ public final class XMoreLikeThis {
|
|||
* @param words a map of words keyed on the word(String) with Int objects as the values.
|
||||
*/
|
||||
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
|
||||
return createQueue(words, this.fieldNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a PriorityQueue from a word->tf map.
|
||||
*
|
||||
* @param words a map of words keyed on the word(String) with Int objects as the values.
|
||||
* @param fieldNames an array of field names to override defaults.
|
||||
*/
|
||||
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
|
||||
// have collected all words in doc and their freqs
|
||||
int numDocs = ir.numDocs();
|
||||
final int limit = Math.min(maxQueryTerms, words.size());
|
||||
|
|
|
@ -1623,6 +1623,43 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMLTPercentTermsToMatch() throws Exception {
|
||||
// setup for mocking fetching items
|
||||
MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
|
||||
parser.setFetchService(new MockMoreLikeThisFetchService());
|
||||
|
||||
// parsing the ES query
|
||||
IndexQueryParserService queryParser = queryParser();
|
||||
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
|
||||
BooleanQuery parsedQuery = (BooleanQuery) queryParser.parse(query).query();
|
||||
|
||||
// get MLT query, other clause is for include/exclude items
|
||||
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) parsedQuery.getClauses()[0].getQuery();
|
||||
|
||||
// all terms must match
|
||||
mltQuery.setPercentTermsToMatch(1.0f);
|
||||
mltQuery.setMinWordLen(0);
|
||||
mltQuery.setMinDocFreq(0);
|
||||
|
||||
// one document has all values
|
||||
MemoryIndex index = new MemoryIndex();
|
||||
index.addField("name.first", "apache lucene", new WhitespaceAnalyzer());
|
||||
index.addField("name.last", "1 2 3 4", new WhitespaceAnalyzer());
|
||||
|
||||
// two clauses, one for items and one for like_text if set
|
||||
BooleanQuery luceneQuery = (BooleanQuery) mltQuery.rewrite(index.createSearcher().getIndexReader());
|
||||
BooleanClause[] clauses = luceneQuery.getClauses();
|
||||
|
||||
// check for items
|
||||
int minNumberShouldMatch = ((BooleanQuery) (clauses[0].getQuery())).getMinimumNumberShouldMatch();
|
||||
assertThat(minNumberShouldMatch, is(4));
|
||||
|
||||
// and for like_text
|
||||
minNumberShouldMatch = ((BooleanQuery) (clauses[1].getQuery())).getMinimumNumberShouldMatch();
|
||||
assertThat(minNumberShouldMatch, is(2));
|
||||
}
|
||||
|
||||
private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
|
||||
|
||||
public MockMoreLikeThisFetchService() {
|
||||
|
|
Loading…
Reference in New Issue