mirror of https://github.com/apache/lucene.git
LUCENE-4727: use float as minShouldMatch on CommonTermsQuery
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1439449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
765d3ef36c
commit
ce7be4dc65
|
@ -74,7 +74,7 @@ public class CommonTermsQuery extends Query {
|
||||||
protected final Occur highFreqOccur;
|
protected final Occur highFreqOccur;
|
||||||
protected float lowFreqBoost = 1.0f;
|
protected float lowFreqBoost = 1.0f;
|
||||||
protected float highFreqBoost = 1.0f;
|
protected float highFreqBoost = 1.0f;
|
||||||
protected int minNrShouldMatch = 0;
|
protected float minNrShouldMatch = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new {@link CommonTermsQuery}
|
* Creates a new {@link CommonTermsQuery}
|
||||||
|
@ -84,7 +84,7 @@ public class CommonTermsQuery extends Query {
|
||||||
* @param lowFreqOccur
|
* @param lowFreqOccur
|
||||||
* {@link Occur} used for low frequency terms
|
* {@link Occur} used for low frequency terms
|
||||||
* @param maxTermFrequency
|
* @param maxTermFrequency
|
||||||
* a value in [0..1] (or absolute number >=1) representing the
|
* a value in [0..1) (or absolute number >=1) representing the
|
||||||
* maximum threshold of a terms document frequency to be considered a
|
* maximum threshold of a terms document frequency to be considered a
|
||||||
* low frequency term.
|
* low frequency term.
|
||||||
* @throws IllegalArgumentException
|
* @throws IllegalArgumentException
|
||||||
|
@ -104,7 +104,7 @@ public class CommonTermsQuery extends Query {
|
||||||
* @param lowFreqOccur
|
* @param lowFreqOccur
|
||||||
* {@link Occur} used for low frequency terms
|
* {@link Occur} used for low frequency terms
|
||||||
* @param maxTermFrequency
|
* @param maxTermFrequency
|
||||||
* a value in [0..1] (or absolute number >=1) representing the
|
* a value in [0..1) (or absolute number >=1) representing the
|
||||||
* maximum threshold of a terms document frequency to be considered a
|
* maximum threshold of a terms document frequency to be considered a
|
||||||
* low frequency term.
|
* low frequency term.
|
||||||
* @param disableCoord
|
* @param disableCoord
|
||||||
|
@ -160,15 +160,19 @@ public class CommonTermsQuery extends Query {
|
||||||
return buildQuery(maxDoc, contextArray, queryTerms);
|
return buildQuery(maxDoc, contextArray, queryTerms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
|
||||||
|
if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) {
|
||||||
|
return (int) minNrShouldMatch;
|
||||||
|
}
|
||||||
|
return (int) (Math.round(minNrShouldMatch * numOptional));
|
||||||
|
}
|
||||||
|
|
||||||
protected Query buildQuery(final int maxDoc,
|
protected Query buildQuery(final int maxDoc,
|
||||||
final TermContext[] contextArray, final Term[] queryTerms) {
|
final TermContext[] contextArray, final Term[] queryTerms) {
|
||||||
BooleanQuery lowFreq = new BooleanQuery(disableCoord);
|
BooleanQuery lowFreq = new BooleanQuery(disableCoord);
|
||||||
BooleanQuery highFreq = new BooleanQuery(disableCoord);
|
BooleanQuery highFreq = new BooleanQuery(disableCoord);
|
||||||
highFreq.setBoost(highFreqBoost);
|
highFreq.setBoost(highFreqBoost);
|
||||||
lowFreq.setBoost(lowFreqBoost);
|
lowFreq.setBoost(lowFreqBoost);
|
||||||
if (lowFreqOccur == Occur.SHOULD) {
|
|
||||||
lowFreq.setMinimumNumberShouldMatch(minNrShouldMatch);
|
|
||||||
}
|
|
||||||
BooleanQuery query = new BooleanQuery(true);
|
BooleanQuery query = new BooleanQuery(true);
|
||||||
for (int i = 0; i < queryTerms.length; i++) {
|
for (int i = 0; i < queryTerms.length; i++) {
|
||||||
TermContext termContext = contextArray[i];
|
TermContext termContext = contextArray[i];
|
||||||
|
@ -186,6 +190,11 @@ public class CommonTermsQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
final int numLowFreqClauses = lowFreq.clauses().size();
|
||||||
|
if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
|
||||||
|
int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
|
||||||
|
lowFreq.setMinimumNumberShouldMatch(minMustMatch);
|
||||||
|
}
|
||||||
if (lowFreq.clauses().isEmpty()) {
|
if (lowFreq.clauses().isEmpty()) {
|
||||||
/*
|
/*
|
||||||
* if lowFreq is empty we rewrite the high freq terms in a conjunction to
|
* if lowFreq is empty we rewrite the high freq terms in a conjunction to
|
||||||
|
@ -265,7 +274,9 @@ public class CommonTermsQuery extends Query {
|
||||||
/**
|
/**
|
||||||
* Specifies a minimum number of the optional BooleanClauses which must be
|
* Specifies a minimum number of the optional BooleanClauses which must be
|
||||||
* satisfied in order to produce a match on the low frequency terms query
|
* satisfied in order to produce a match on the low frequency terms query
|
||||||
* part.
|
* part. This method accepts a float value in the range [0..1) as a fraction
|
||||||
|
* of the actual query terms in the low frequency clause or a number
|
||||||
|
* <tt>>=1</tt> as an absolute number of clauses that need to match.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* By default no optional clauses are necessary for a match (unless there are
|
* By default no optional clauses are necessary for a match (unless there are
|
||||||
|
@ -276,7 +287,7 @@ public class CommonTermsQuery extends Query {
|
||||||
* @param min
|
* @param min
|
||||||
* the number of optional clauses that must match
|
* the number of optional clauses that must match
|
||||||
*/
|
*/
|
||||||
public void setMinimumNumberShouldMatch(int min) {
|
public void setMinimumNumberShouldMatch(float min) {
|
||||||
this.minNrShouldMatch = min;
|
this.minNrShouldMatch = min;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -284,7 +295,7 @@ public class CommonTermsQuery extends Query {
|
||||||
* Gets the minimum number of the optional BooleanClauses which must be
|
* Gets the minimum number of the optional BooleanClauses which must be
|
||||||
* satisfied.
|
* satisfied.
|
||||||
*/
|
*/
|
||||||
public int getMinimumNumberShouldMatch() {
|
public float getMinimumNumberShouldMatch() {
|
||||||
return minNrShouldMatch;
|
return minNrShouldMatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -332,7 +343,7 @@ public class CommonTermsQuery extends Query {
|
||||||
result = prime * result
|
result = prime * result
|
||||||
+ ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
|
+ ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
|
||||||
result = prime * result + Float.floatToIntBits(maxTermFrequency);
|
result = prime * result + Float.floatToIntBits(maxTermFrequency);
|
||||||
result = prime * result + minNrShouldMatch;
|
result = prime * result + Float.floatToIntBits(minNrShouldMatch);
|
||||||
result = prime * result + ((terms == null) ? 0 : terms.hashCode());
|
result = prime * result + ((terms == null) ? 0 : terms.hashCode());
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -175,6 +175,90 @@ public class CommonTermsQueryTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMinShouldMatch() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
String[] docs = new String[] {"this is the end of the world right",
|
||||||
|
"is this it or maybe not",
|
||||||
|
"this is the end of the universe as we know it",
|
||||||
|
"there is the famous restaurant at the end of the universe",};
|
||||||
|
for (int i = 0; i < docs.length; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newStringField("id", "" + i, Field.Store.YES));
|
||||||
|
doc.add(newTextField("field", docs[i], Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
IndexSearcher s = newSearcher(r);
|
||||||
|
{
|
||||||
|
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
|
||||||
|
random().nextBoolean() ? 2.0f : 0.5f);
|
||||||
|
query.add(new Term("field", "is"));
|
||||||
|
query.add(new Term("field", "this"));
|
||||||
|
query.add(new Term("field", "end"));
|
||||||
|
query.add(new Term("field", "world"));
|
||||||
|
query.add(new Term("field", "universe"));
|
||||||
|
query.add(new Term("field", "right"));
|
||||||
|
query.setMinimumNumberShouldMatch(0.5f);
|
||||||
|
TopDocs search = s.search(query, 10);
|
||||||
|
assertEquals(search.totalHits, 1);
|
||||||
|
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
|
||||||
|
random().nextBoolean() ? 2.0f : 0.5f);
|
||||||
|
query.add(new Term("field", "is"));
|
||||||
|
query.add(new Term("field", "this"));
|
||||||
|
query.add(new Term("field", "end"));
|
||||||
|
query.add(new Term("field", "world"));
|
||||||
|
query.add(new Term("field", "universe"));
|
||||||
|
query.add(new Term("field", "right"));
|
||||||
|
query.setMinimumNumberShouldMatch(2.0f);
|
||||||
|
TopDocs search = s.search(query, 10);
|
||||||
|
assertEquals(search.totalHits, 1);
|
||||||
|
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
|
||||||
|
random().nextBoolean() ? 2.0f : 0.5f);
|
||||||
|
query.add(new Term("field", "is"));
|
||||||
|
query.add(new Term("field", "this"));
|
||||||
|
query.add(new Term("field", "end"));
|
||||||
|
query.add(new Term("field", "world"));
|
||||||
|
query.add(new Term("field", "universe"));
|
||||||
|
query.add(new Term("field", "right"));
|
||||||
|
query.setMinimumNumberShouldMatch(0.49f);
|
||||||
|
TopDocs search = s.search(query, 10);
|
||||||
|
assertEquals(search.totalHits, 3);
|
||||||
|
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||||
|
assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
|
||||||
|
assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
|
||||||
|
random().nextBoolean() ? 2.0f : 0.5f);
|
||||||
|
query.add(new Term("field", "is"));
|
||||||
|
query.add(new Term("field", "this"));
|
||||||
|
query.add(new Term("field", "end"));
|
||||||
|
query.add(new Term("field", "world"));
|
||||||
|
query.add(new Term("field", "universe"));
|
||||||
|
query.add(new Term("field", "right"));
|
||||||
|
query.setMinimumNumberShouldMatch(1.0f);
|
||||||
|
TopDocs search = s.search(query, 10);
|
||||||
|
assertEquals(search.totalHits, 3);
|
||||||
|
assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
|
||||||
|
assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
|
||||||
|
assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testIllegalOccur() {
|
public void testIllegalOccur() {
|
||||||
Random random = random();
|
Random random = random();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue