LUCENE-5149: CommonTermsQuery should allow minNrShouldMatch for high & low freq terms

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508453 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-07-30 14:08:14 +00:00
parent 6ec978de46
commit 4d29e7e20e
3 changed files with 102 additions and 23 deletions

View File

@ -130,7 +130,11 @@ API Changes
migrated to the new FacetsAggregator and FacetsAccumulator API. Also,
FacetRequest.createAggregator was replaced by OldFacetsAccumulator.createAggregator.
(Shai Erera)
* LUCENE-5149: CommonTermsQuery now allows to set the minimum number of terms that
should match for its high and low frequent sub-queries. Previously this was only
supported on the low frequent terms query. (Simon Willnauer)
Optimizations
* LUCENE-5088: Added TermFilter to filter docs by a specific term.

View File

@ -74,7 +74,8 @@ public class CommonTermsQuery extends Query {
protected final Occur highFreqOccur;
protected float lowFreqBoost = 1.0f;
protected float highFreqBoost = 1.0f;
protected float minNrShouldMatch = 0;
protected float lowFreqMinNrShouldMatch = 0;
protected float highFreqMinNrShouldMatch = 0;
/**
* Creates a new {@link CommonTermsQuery}
@ -161,10 +162,18 @@ public class CommonTermsQuery extends Query {
}
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) {
return (int) minNrShouldMatch;
}
return (int) (Math.round(minNrShouldMatch * numOptional));
return minNrShouldMatch(lowFreqMinNrShouldMatch, numOptional);
}
protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) {
return minNrShouldMatch(highFreqMinNrShouldMatch, numOptional);
}
private final int minNrShouldMatch(float minNrShouldMatch, int numOptional) {
if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) {
return (int) minNrShouldMatch;
}
return (int) (Math.round(minNrShouldMatch * numOptional));
}
protected Query buildQuery(final int maxDoc,
@ -190,11 +199,16 @@ public class CommonTermsQuery extends Query {
}
}
final int numLowFreqClauses = lowFreq.clauses().size();
final int numLowFreqClauses = lowFreq.clauses().size();
final int numHighFreqClauses = highFreq.clauses().size();
if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
lowFreq.setMinimumNumberShouldMatch(minMustMatch);
}
if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
int minMustMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses);
highFreq.setMinimumNumberShouldMatch(minMustMatch);
}
if (lowFreq.clauses().isEmpty()) {
/*
* if lowFreq is empty we rewrite the high freq terms in a conjunction to
@ -272,7 +286,7 @@ public class CommonTermsQuery extends Query {
}
/**
* Specifies a minimum number of the optional BooleanClauses which must be
* Specifies a minimum number of the low frequent optional BooleanClauses which must be
* satisfied in order to produce a match on the low frequency terms query
* part. This method accepts a float value in the range [0..1) as a fraction
* of the actual query terms in the low frequent clause or a number
@ -287,16 +301,44 @@ public class CommonTermsQuery extends Query {
* @param min
* the number of optional clauses that must match
*/
public void setMinimumNumberShouldMatch(float min) {
this.minNrShouldMatch = min;
public void setLowFreqMinimumNumberShouldMatch(float min) {
this.lowFreqMinNrShouldMatch = min;
}
/**
* Gets the minimum number of the optional BooleanClauses which must be
* Gets the minimum number of the optional low frequent BooleanClauses which must be
* satisfied.
*/
public float getMinimumNumberShouldMatch() {
return minNrShouldMatch;
public float getLowFreqMinimumNumberShouldMatch() {
return lowFreqMinNrShouldMatch;
}
/**
* Specifies a minimum number of the high frequent optional BooleanClauses which must be
* satisfied in order to produce a match on the low frequency terms query
* part. This method accepts a float value in the range [0..1) as a fraction
* of the actual query terms in the low frequent clause or a number
* <tt>&gt;=1</tt> as an absolut number of clauses that need to match.
*
* <p>
* By default no optional clauses are necessary for a match (unless there are
* no required clauses). If this method is used, then the specified number of
* clauses is required.
* </p>
*
* @param min
* the number of optional clauses that must match
*/
public void setHighFreqMinimumNumberShouldMatch(float min) {
this.highFreqMinNrShouldMatch = min;
}
/**
* Gets the minimum number of the optional high frequent BooleanClauses which must be
* satisfied.
*/
public float getHighFreqMinimumNumberShouldMatch() {
return highFreqMinNrShouldMatch;
}
@Override
@ -308,7 +350,7 @@ public class CommonTermsQuery extends Query {
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
boolean needParens = (getBoost() != 1.0)
|| (getMinimumNumberShouldMatch() > 0);
|| (getLowFreqMinimumNumberShouldMatch() > 0);
if (needParens) {
buffer.append("(");
}
@ -321,9 +363,12 @@ public class CommonTermsQuery extends Query {
if (needParens) {
buffer.append(")");
}
if (getMinimumNumberShouldMatch() > 0) {
if (getLowFreqMinimumNumberShouldMatch() > 0 || getHighFreqMinimumNumberShouldMatch() > 0) {
buffer.append('~');
buffer.append(getMinimumNumberShouldMatch());
buffer.append("(");
buffer.append(getLowFreqMinimumNumberShouldMatch());
buffer.append(getHighFreqMinimumNumberShouldMatch());
buffer.append(")");
}
if (getBoost() != 1.0f) {
buffer.append(ToStringUtils.boost(getBoost()));
@ -343,7 +388,8 @@ public class CommonTermsQuery extends Query {
result = prime * result
+ ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
result = prime * result + Float.floatToIntBits(maxTermFrequency);
result = prime * result + Float.floatToIntBits(minNrShouldMatch);
result = prime * result + Float.floatToIntBits(lowFreqMinNrShouldMatch);
result = prime * result + Float.floatToIntBits(highFreqMinNrShouldMatch);
result = prime * result + ((terms == null) ? 0 : terms.hashCode());
return result;
}
@ -363,7 +409,8 @@ public class CommonTermsQuery extends Query {
if (lowFreqOccur != other.lowFreqOccur) return false;
if (Float.floatToIntBits(maxTermFrequency) != Float
.floatToIntBits(other.maxTermFrequency)) return false;
if (minNrShouldMatch != other.minNrShouldMatch) return false;
if (lowFreqMinNrShouldMatch != other.lowFreqMinNrShouldMatch) return false;
if (highFreqMinNrShouldMatch != other.highFreqMinNrShouldMatch) return false;
if (terms == null) {
if (other.terms != null) return false;
} else if (!terms.equals(other.terms)) return false;

View File

@ -19,6 +19,7 @@ package org.apache.lucene.queries;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
@ -146,6 +147,8 @@ public class CommonTermsQueryTest extends LuceneTestCase {
left.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
.randomRealisticUnicodeString(r)));
}
left.setHighFreqMinimumNumberShouldMatch(r.nextInt(4));
left.setLowFreqMinimumNumberShouldMatch(r.nextInt(4));
r = new Random(seed);
CommonTermsQuery right = new CommonTermsQuery(randomOccur(r),
@ -155,6 +158,8 @@ public class CommonTermsQueryTest extends LuceneTestCase {
right.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
.randomRealisticUnicodeString(r)));
}
right.setHighFreqMinimumNumberShouldMatch(r.nextInt(4));
right.setLowFreqMinimumNumberShouldMatch(r.nextInt(4));
QueryUtils.checkEqual(left, right);
}
}
@ -200,7 +205,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
query.add(new Term("field", "world"));
query.add(new Term("field", "universe"));
query.add(new Term("field", "right"));
query.setMinimumNumberShouldMatch(0.5f);
query.setLowFreqMinimumNumberShouldMatch(0.5f);
TopDocs search = s.search(query, 10);
assertEquals(search.totalHits, 1);
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
@ -214,7 +219,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
query.add(new Term("field", "world"));
query.add(new Term("field", "universe"));
query.add(new Term("field", "right"));
query.setMinimumNumberShouldMatch(2.0f);
query.setLowFreqMinimumNumberShouldMatch(2.0f);
TopDocs search = s.search(query, 10);
assertEquals(search.totalHits, 1);
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
@ -229,7 +234,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
query.add(new Term("field", "world"));
query.add(new Term("field", "universe"));
query.add(new Term("field", "right"));
query.setMinimumNumberShouldMatch(0.49f);
query.setLowFreqMinimumNumberShouldMatch(0.49f);
TopDocs search = s.search(query, 10);
assertEquals(search.totalHits, 3);
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
@ -246,14 +251,37 @@ public class CommonTermsQueryTest extends LuceneTestCase {
query.add(new Term("field", "world"));
query.add(new Term("field", "universe"));
query.add(new Term("field", "right"));
query.setMinimumNumberShouldMatch(1.0f);
query.setLowFreqMinimumNumberShouldMatch(1.0f);
TopDocs search = s.search(query, 10);
assertEquals(search.totalHits, 3);
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
assertTrue(search.scoreDocs[1].score > search.scoreDocs[2].score);
}
{
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
random().nextBoolean() ? 2.0f : 0.5f);
query.add(new Term("field", "is"));
query.add(new Term("field", "this"));
query.add(new Term("field", "end"));
query.add(new Term("field", "world"));
query.add(new Term("field", "universe"));
query.add(new Term("field", "right"));
query.setLowFreqMinimumNumberShouldMatch(1.0f);
query.setHighFreqMinimumNumberShouldMatch(4.0f);
TopDocs search = s.search(query, 10);
assertEquals(search.totalHits, 3);
assertEquals(search.scoreDocs[1].score, search.scoreDocs[2].score, 0.0f);
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
// doc 2 and 3 only get a score from low freq terms
assertEquals(
new HashSet<>(Arrays.asList("2", "3")),
new HashSet<>(Arrays.asList(
r.document(search.scoreDocs[1].doc).get("id"),
r.document(search.scoreDocs[2].doc).get("id"))));
}
r.close();
w.close();
dir.close();