LUCENE-4727: use float as minShouldMatch on CommonTermsQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1439449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-01-28 15:32:25 +00:00
parent 765d3ef36c
commit ce7be4dc65
2 changed files with 105 additions and 10 deletions

View File

@ -74,7 +74,7 @@ public class CommonTermsQuery extends Query {
protected final Occur highFreqOccur; protected final Occur highFreqOccur;
protected float lowFreqBoost = 1.0f; protected float lowFreqBoost = 1.0f;
protected float highFreqBoost = 1.0f; protected float highFreqBoost = 1.0f;
protected int minNrShouldMatch = 0; protected float minNrShouldMatch = 0;
/** /**
* Creates a new {@link CommonTermsQuery} * Creates a new {@link CommonTermsQuery}
@ -84,7 +84,7 @@ public class CommonTermsQuery extends Query {
* @param lowFreqOccur * @param lowFreqOccur
* {@link Occur} used for low frequency terms * {@link Occur} used for low frequency terms
* @param maxTermFrequency * @param maxTermFrequency
* a value in [0..1] (or absolute number >=1) representing the * a value in [0..1) (or absolute number >=1) representing the
* maximum threshold of a terms document frequency to be considered a * maximum threshold of a terms document frequency to be considered a
* low frequency term. * low frequency term.
* @throws IllegalArgumentException * @throws IllegalArgumentException
@ -104,7 +104,7 @@ public class CommonTermsQuery extends Query {
* @param lowFreqOccur * @param lowFreqOccur
* {@link Occur} used for low frequency terms * {@link Occur} used for low frequency terms
* @param maxTermFrequency * @param maxTermFrequency
* a value in [0..1] (or absolute number >=1) representing the * a value in [0..1) (or absolute number >=1) representing the
* maximum threshold of a terms document frequency to be considered a * maximum threshold of a terms document frequency to be considered a
* low frequency term. * low frequency term.
* @param disableCoord * @param disableCoord
@ -160,15 +160,19 @@ public class CommonTermsQuery extends Query {
return buildQuery(maxDoc, contextArray, queryTerms); return buildQuery(maxDoc, contextArray, queryTerms);
} }
/**
 * Translates the configured minimum-should-match value into the clause
 * count applied to the low frequency query part.
 * <p>
 * A configured value of exactly <tt>0</tt> or of <tt>&gt;=1</tt> is taken
 * as-is and truncated to an integer; any other value is multiplied by
 * <code>numOptional</code> and rounded to the nearest integer.
 *
 * @param numOptional
 *          the number of optional clauses the value is resolved against
 * @return the number of optional clauses that must match
 */
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
  final boolean usedVerbatim = minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f;
  // Math.round(float) already yields an int, so no further narrowing is needed.
  return usedVerbatim
      ? (int) minNrShouldMatch
      : Math.round(minNrShouldMatch * numOptional);
}
protected Query buildQuery(final int maxDoc, protected Query buildQuery(final int maxDoc,
final TermContext[] contextArray, final Term[] queryTerms) { final TermContext[] contextArray, final Term[] queryTerms) {
BooleanQuery lowFreq = new BooleanQuery(disableCoord); BooleanQuery lowFreq = new BooleanQuery(disableCoord);
BooleanQuery highFreq = new BooleanQuery(disableCoord); BooleanQuery highFreq = new BooleanQuery(disableCoord);
highFreq.setBoost(highFreqBoost); highFreq.setBoost(highFreqBoost);
lowFreq.setBoost(lowFreqBoost); lowFreq.setBoost(lowFreqBoost);
if (lowFreqOccur == Occur.SHOULD) {
lowFreq.setMinimumNumberShouldMatch(minNrShouldMatch);
}
BooleanQuery query = new BooleanQuery(true); BooleanQuery query = new BooleanQuery(true);
for (int i = 0; i < queryTerms.length; i++) { for (int i = 0; i < queryTerms.length; i++) {
TermContext termContext = contextArray[i]; TermContext termContext = contextArray[i];
@ -186,6 +190,11 @@ public class CommonTermsQuery extends Query {
} }
} }
final int numLowFreqClauses = lowFreq.clauses().size();
if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
lowFreq.setMinimumNumberShouldMatch(minMustMatch);
}
if (lowFreq.clauses().isEmpty()) { if (lowFreq.clauses().isEmpty()) {
/* /*
* if lowFreq is empty we rewrite the high freq terms in a conjunction to * if lowFreq is empty we rewrite the high freq terms in a conjunction to
@ -265,7 +274,9 @@ public class CommonTermsQuery extends Query {
/** /**
* Specifies a minimum number of the optional BooleanClauses which must be * Specifies a minimum number of the optional BooleanClauses which must be
* satisfied in order to produce a match on the low frequency terms query * satisfied in order to produce a match on the low frequency terms query
* part. * part. This method accepts a float value in the range [0..1) as a fraction
* of the actual query terms in the low frequency clause or a number
* <tt>&gt;=1</tt> as an absolute number of clauses that need to match.
* *
* <p> * <p>
* By default no optional clauses are necessary for a match (unless there are * By default no optional clauses are necessary for a match (unless there are
@ -276,7 +287,7 @@ public class CommonTermsQuery extends Query {
* @param min * @param min
* the number of optional clauses that must match * the number of optional clauses that must match
*/ */
public void setMinimumNumberShouldMatch(int min) { public void setMinimumNumberShouldMatch(float min) {
this.minNrShouldMatch = min; this.minNrShouldMatch = min;
} }
@ -284,7 +295,7 @@ public class CommonTermsQuery extends Query {
* Gets the minimum number of the optional BooleanClauses which must be * Gets the minimum number of the optional BooleanClauses which must be
* satisfied. * satisfied.
*/ */
public int getMinimumNumberShouldMatch() { public float getMinimumNumberShouldMatch() {
return minNrShouldMatch; return minNrShouldMatch;
} }
@ -332,7 +343,7 @@ public class CommonTermsQuery extends Query {
result = prime * result result = prime * result
+ ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode()); + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
result = prime * result + Float.floatToIntBits(maxTermFrequency); result = prime * result + Float.floatToIntBits(maxTermFrequency);
result = prime * result + minNrShouldMatch; result = prime * result + Float.floatToIntBits(minNrShouldMatch);
result = prime * result + ((terms == null) ? 0 : terms.hashCode()); result = prime * result + ((terms == null) ? 0 : terms.hashCode());
return result; return result;
} }

View File

@ -175,6 +175,90 @@ public class CommonTermsQueryTest extends LuceneTestCase {
} }
} }
/**
 * Checks that the value given to
 * {@link CommonTermsQuery#setMinimumNumberShouldMatch(float)} restricts the
 * hits as expected, both for values below 1 and for values of 1 or more.
 */
public void testMinShouldMatch() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  String[] docs = new String[] {
      "this is the end of the world right",
      "is this it or maybe not",
      "this is the end of the universe as we know it",
      "there is the famous restaurant at the end of the universe"};
  for (int i = 0; i < docs.length; i++) {
    Document doc = new Document();
    // The stored id lets the assertions below identify each hit.
    doc.add(newStringField("id", "" + i, Field.Store.YES));
    doc.add(newTextField("field", docs[i], Field.Store.NO));
    w.addDocument(doc);
  }

  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);
  {
    // Fractional value 0.5: only document 0 is expected.
    CommonTermsQuery query = newMinShouldMatchQuery(0.5f);
    TopDocs search = s.search(query, 10);
    assertEquals(1, search.totalHits);
    assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
  }
  {
    // Absolute value 2: only document 0 is expected.
    CommonTermsQuery query = newMinShouldMatchQuery(2.0f);
    TopDocs search = s.search(query, 10);
    assertEquals(1, search.totalHits);
    assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
  }
  {
    // Fractional value 0.49: documents 0, 2 and 3 are expected, in that order.
    CommonTermsQuery query = newMinShouldMatchQuery(0.49f);
    TopDocs search = s.search(query, 10);
    assertEquals(3, search.totalHits);
    assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
    assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
    assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
  }
  {
    // Absolute value 1: documents 0, 2 and 3 are expected, in that order.
    CommonTermsQuery query = newMinShouldMatchQuery(1.0f);
    TopDocs search = s.search(query, 10);
    assertEquals(3, search.totalHits);
    assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
    assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
    assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
  }
  r.close();
  w.close();
  dir.close();
}

/**
 * Builds the six-term query shared by all cases of
 * {@link #testMinShouldMatch()}. The maximum term frequency is randomly
 * chosen between 2.0f and 0.5f.
 *
 * @param minShouldMatch
 *          value passed to
 *          {@link CommonTermsQuery#setMinimumNumberShouldMatch(float)}
 */
private CommonTermsQuery newMinShouldMatchQuery(float minShouldMatch) {
  CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
      random().nextBoolean() ? 2.0f : 0.5f);
  query.add(new Term("field", "is"));
  query.add(new Term("field", "this"));
  query.add(new Term("field", "end"));
  query.add(new Term("field", "world"));
  query.add(new Term("field", "universe"));
  query.add(new Term("field", "right"));
  query.setMinimumNumberShouldMatch(minShouldMatch);
  return query;
}
public void testIllegalOccur() { public void testIllegalOccur() {
Random random = random(); Random random = random();