mirror of https://github.com/apache/lucene.git
LUCENE-5149: CommonTermsQuery should allow minNrShouldMatch for high & low freq terms
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508453 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6ec978de46
commit
4d29e7e20e
|
@ -130,7 +130,11 @@ API Changes
|
|||
migrated to the new FacetsAggregator and FacetsAccumulator API. Also,
|
||||
FacetRequest.createAggregator was replaced by OldFacetsAccumulator.createAggregator.
|
||||
(Shai Erera)
|
||||
|
||||
|
||||
* LUCENE-5149: CommonTermsQuery now allows to set the minimum number of terms that
|
||||
should match for its high and low frequent sub-queries. Previously this was only
|
||||
supported on the low frequent terms query. (Simon Willnauer)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-5088: Added TermFilter to filter docs by a specific term.
|
||||
|
|
|
@ -74,7 +74,8 @@ public class CommonTermsQuery extends Query {
|
|||
protected final Occur highFreqOccur;
|
||||
protected float lowFreqBoost = 1.0f;
|
||||
protected float highFreqBoost = 1.0f;
|
||||
protected float minNrShouldMatch = 0;
|
||||
protected float lowFreqMinNrShouldMatch = 0;
|
||||
protected float highFreqMinNrShouldMatch = 0;
|
||||
|
||||
/**
|
||||
* Creates a new {@link CommonTermsQuery}
|
||||
|
@ -161,10 +162,18 @@ public class CommonTermsQuery extends Query {
|
|||
}
|
||||
|
||||
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
|
||||
if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) {
|
||||
return (int) minNrShouldMatch;
|
||||
}
|
||||
return (int) (Math.round(minNrShouldMatch * numOptional));
|
||||
return minNrShouldMatch(lowFreqMinNrShouldMatch, numOptional);
|
||||
}
|
||||
|
||||
protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) {
|
||||
return minNrShouldMatch(highFreqMinNrShouldMatch, numOptional);
|
||||
}
|
||||
|
||||
private final int minNrShouldMatch(float minNrShouldMatch, int numOptional) {
|
||||
if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) {
|
||||
return (int) minNrShouldMatch;
|
||||
}
|
||||
return (int) (Math.round(minNrShouldMatch * numOptional));
|
||||
}
|
||||
|
||||
protected Query buildQuery(final int maxDoc,
|
||||
|
@ -190,11 +199,16 @@ public class CommonTermsQuery extends Query {
|
|||
}
|
||||
|
||||
}
|
||||
final int numLowFreqClauses = lowFreq.clauses().size();
|
||||
final int numLowFreqClauses = lowFreq.clauses().size();
|
||||
final int numHighFreqClauses = highFreq.clauses().size();
|
||||
if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
|
||||
int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
|
||||
lowFreq.setMinimumNumberShouldMatch(minMustMatch);
|
||||
}
|
||||
if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
|
||||
int minMustMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses);
|
||||
highFreq.setMinimumNumberShouldMatch(minMustMatch);
|
||||
}
|
||||
if (lowFreq.clauses().isEmpty()) {
|
||||
/*
|
||||
* if lowFreq is empty we rewrite the high freq terms in a conjunction to
|
||||
|
@ -272,7 +286,7 @@ public class CommonTermsQuery extends Query {
|
|||
}
|
||||
|
||||
/**
|
||||
* Specifies a minimum number of the optional BooleanClauses which must be
|
||||
* Specifies a minimum number of the low frequent optional BooleanClauses which must be
|
||||
* satisfied in order to produce a match on the low frequency terms query
|
||||
* part. This method accepts a float value in the range [0..1) as a fraction
|
||||
* of the actual query terms in the low frequent clause or a number
|
||||
|
@ -287,16 +301,44 @@ public class CommonTermsQuery extends Query {
|
|||
* @param min
|
||||
* the number of optional clauses that must match
|
||||
*/
|
||||
public void setMinimumNumberShouldMatch(float min) {
|
||||
this.minNrShouldMatch = min;
|
||||
public void setLowFreqMinimumNumberShouldMatch(float min) {
|
||||
this.lowFreqMinNrShouldMatch = min;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the minimum number of the optional BooleanClauses which must be
|
||||
* Gets the minimum number of the optional low frequent BooleanClauses which must be
|
||||
* satisfied.
|
||||
*/
|
||||
public float getMinimumNumberShouldMatch() {
|
||||
return minNrShouldMatch;
|
||||
public float getLowFreqMinimumNumberShouldMatch() {
|
||||
return lowFreqMinNrShouldMatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specifies a minimum number of the high frequent optional BooleanClauses which must be
|
||||
* satisfied in order to produce a match on the low frequency terms query
|
||||
* part. This method accepts a float value in the range [0..1) as a fraction
|
||||
* of the actual query terms in the low frequent clause or a number
|
||||
* <tt>>=1</tt> as an absolut number of clauses that need to match.
|
||||
*
|
||||
* <p>
|
||||
* By default no optional clauses are necessary for a match (unless there are
|
||||
* no required clauses). If this method is used, then the specified number of
|
||||
* clauses is required.
|
||||
* </p>
|
||||
*
|
||||
* @param min
|
||||
* the number of optional clauses that must match
|
||||
*/
|
||||
public void setHighFreqMinimumNumberShouldMatch(float min) {
|
||||
this.highFreqMinNrShouldMatch = min;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the minimum number of the optional high frequent BooleanClauses which must be
|
||||
* satisfied.
|
||||
*/
|
||||
public float getHighFreqMinimumNumberShouldMatch() {
|
||||
return highFreqMinNrShouldMatch;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -308,7 +350,7 @@ public class CommonTermsQuery extends Query {
|
|||
public String toString(String field) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
boolean needParens = (getBoost() != 1.0)
|
||||
|| (getMinimumNumberShouldMatch() > 0);
|
||||
|| (getLowFreqMinimumNumberShouldMatch() > 0);
|
||||
if (needParens) {
|
||||
buffer.append("(");
|
||||
}
|
||||
|
@ -321,9 +363,12 @@ public class CommonTermsQuery extends Query {
|
|||
if (needParens) {
|
||||
buffer.append(")");
|
||||
}
|
||||
if (getMinimumNumberShouldMatch() > 0) {
|
||||
if (getLowFreqMinimumNumberShouldMatch() > 0 || getHighFreqMinimumNumberShouldMatch() > 0) {
|
||||
buffer.append('~');
|
||||
buffer.append(getMinimumNumberShouldMatch());
|
||||
buffer.append("(");
|
||||
buffer.append(getLowFreqMinimumNumberShouldMatch());
|
||||
buffer.append(getHighFreqMinimumNumberShouldMatch());
|
||||
buffer.append(")");
|
||||
}
|
||||
if (getBoost() != 1.0f) {
|
||||
buffer.append(ToStringUtils.boost(getBoost()));
|
||||
|
@ -343,7 +388,8 @@ public class CommonTermsQuery extends Query {
|
|||
result = prime * result
|
||||
+ ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
|
||||
result = prime * result + Float.floatToIntBits(maxTermFrequency);
|
||||
result = prime * result + Float.floatToIntBits(minNrShouldMatch);
|
||||
result = prime * result + Float.floatToIntBits(lowFreqMinNrShouldMatch);
|
||||
result = prime * result + Float.floatToIntBits(highFreqMinNrShouldMatch);
|
||||
result = prime * result + ((terms == null) ? 0 : terms.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
@ -363,7 +409,8 @@ public class CommonTermsQuery extends Query {
|
|||
if (lowFreqOccur != other.lowFreqOccur) return false;
|
||||
if (Float.floatToIntBits(maxTermFrequency) != Float
|
||||
.floatToIntBits(other.maxTermFrequency)) return false;
|
||||
if (minNrShouldMatch != other.minNrShouldMatch) return false;
|
||||
if (lowFreqMinNrShouldMatch != other.lowFreqMinNrShouldMatch) return false;
|
||||
if (highFreqMinNrShouldMatch != other.highFreqMinNrShouldMatch) return false;
|
||||
if (terms == null) {
|
||||
if (other.terms != null) return false;
|
||||
} else if (!terms.equals(other.terms)) return false;
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.queries;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
@ -146,6 +147,8 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
left.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
|
||||
.randomRealisticUnicodeString(r)));
|
||||
}
|
||||
left.setHighFreqMinimumNumberShouldMatch(r.nextInt(4));
|
||||
left.setLowFreqMinimumNumberShouldMatch(r.nextInt(4));
|
||||
|
||||
r = new Random(seed);
|
||||
CommonTermsQuery right = new CommonTermsQuery(randomOccur(r),
|
||||
|
@ -155,6 +158,8 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
right.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil
|
||||
.randomRealisticUnicodeString(r)));
|
||||
}
|
||||
right.setHighFreqMinimumNumberShouldMatch(r.nextInt(4));
|
||||
right.setLowFreqMinimumNumberShouldMatch(r.nextInt(4));
|
||||
QueryUtils.checkEqual(left, right);
|
||||
}
|
||||
}
|
||||
|
@ -200,7 +205,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
query.add(new Term("field", "world"));
|
||||
query.add(new Term("field", "universe"));
|
||||
query.add(new Term("field", "right"));
|
||||
query.setMinimumNumberShouldMatch(0.5f);
|
||||
query.setLowFreqMinimumNumberShouldMatch(0.5f);
|
||||
TopDocs search = s.search(query, 10);
|
||||
assertEquals(search.totalHits, 1);
|
||||
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||
|
@ -214,7 +219,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
query.add(new Term("field", "world"));
|
||||
query.add(new Term("field", "universe"));
|
||||
query.add(new Term("field", "right"));
|
||||
query.setMinimumNumberShouldMatch(2.0f);
|
||||
query.setLowFreqMinimumNumberShouldMatch(2.0f);
|
||||
TopDocs search = s.search(query, 10);
|
||||
assertEquals(search.totalHits, 1);
|
||||
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||
|
@ -229,7 +234,7 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
query.add(new Term("field", "world"));
|
||||
query.add(new Term("field", "universe"));
|
||||
query.add(new Term("field", "right"));
|
||||
query.setMinimumNumberShouldMatch(0.49f);
|
||||
query.setLowFreqMinimumNumberShouldMatch(0.49f);
|
||||
TopDocs search = s.search(query, 10);
|
||||
assertEquals(search.totalHits, 3);
|
||||
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||
|
@ -246,14 +251,37 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
|||
query.add(new Term("field", "world"));
|
||||
query.add(new Term("field", "universe"));
|
||||
query.add(new Term("field", "right"));
|
||||
query.setMinimumNumberShouldMatch(1.0f);
|
||||
query.setLowFreqMinimumNumberShouldMatch(1.0f);
|
||||
TopDocs search = s.search(query, 10);
|
||||
assertEquals(search.totalHits, 3);
|
||||
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||
assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
|
||||
assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
|
||||
assertTrue(search.scoreDocs[1].score > search.scoreDocs[2].score);
|
||||
}
|
||||
|
||||
{
|
||||
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
|
||||
random().nextBoolean() ? 2.0f : 0.5f);
|
||||
query.add(new Term("field", "is"));
|
||||
query.add(new Term("field", "this"));
|
||||
query.add(new Term("field", "end"));
|
||||
query.add(new Term("field", "world"));
|
||||
query.add(new Term("field", "universe"));
|
||||
query.add(new Term("field", "right"));
|
||||
query.setLowFreqMinimumNumberShouldMatch(1.0f);
|
||||
query.setHighFreqMinimumNumberShouldMatch(4.0f);
|
||||
TopDocs search = s.search(query, 10);
|
||||
assertEquals(search.totalHits, 3);
|
||||
assertEquals(search.scoreDocs[1].score, search.scoreDocs[2].score, 0.0f);
|
||||
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||
// doc 2 and 3 only get a score from low freq terms
|
||||
assertEquals(
|
||||
new HashSet<>(Arrays.asList("2", "3")),
|
||||
new HashSet<>(Arrays.asList(
|
||||
r.document(search.scoreDocs[1].doc).get("id"),
|
||||
r.document(search.scoreDocs[2].doc).get("id"))));
|
||||
}
|
||||
|
||||
r.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
|
|
Loading…
Reference in New Issue