From ce7be4dc657017e0a9802ae6f61ccf0bc35b96de Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 28 Jan 2013 15:32:25 +0000 Subject: [PATCH] LUCENE-4727: use float as minShouldMatch on CommonTermsQuery git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1439449 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/queries/CommonTermsQuery.java | 31 ++++--- .../lucene/queries/CommonTermsQueryTest.java | 84 +++++++++++++++++++ 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java index 3e96a31fb96..5bdbadac8fd 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java @@ -74,7 +74,7 @@ public class CommonTermsQuery extends Query { protected final Occur highFreqOccur; protected float lowFreqBoost = 1.0f; protected float highFreqBoost = 1.0f; - protected int minNrShouldMatch = 0; + protected float minNrShouldMatch = 0; /** * Creates a new {@link CommonTermsQuery} @@ -84,7 +84,7 @@ public class CommonTermsQuery extends Query { * @param lowFreqOccur * {@link Occur} used for low frequency terms * @param maxTermFrequency - * a value in [0..1] (or absolute number >=1) representing the + * a value in [0..1) (or absolute number >=1) representing the * maximum threshold of a terms document frequency to be considered a * low frequency term. * @throws IllegalArgumentException @@ -104,7 +104,7 @@ public class CommonTermsQuery extends Query { * @param lowFreqOccur * {@link Occur} used for low frequency terms * @param maxTermFrequency - * a value in [0..1] (or absolute number >=1) representing the + * a value in [0..1) (or absolute number >=1) representing the * maximum threshold of a terms document frequency to be considered a * low frequency term. * @param disableCoord @@ -160,15 +160,19 @@ public class CommonTermsQuery extends Query { return buildQuery(maxDoc, contextArray, queryTerms); } + protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) { + if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) { + return (int) minNrShouldMatch; + } + return (int) (Math.round(minNrShouldMatch * numOptional)); + } + protected Query buildQuery(final int maxDoc, final TermContext[] contextArray, final Term[] queryTerms) { BooleanQuery lowFreq = new BooleanQuery(disableCoord); BooleanQuery highFreq = new BooleanQuery(disableCoord); highFreq.setBoost(highFreqBoost); lowFreq.setBoost(lowFreqBoost); - if (lowFreqOccur == Occur.SHOULD) { - lowFreq.setMinimumNumberShouldMatch(minNrShouldMatch); - } BooleanQuery query = new BooleanQuery(true); for (int i = 0; i < queryTerms.length; i++) { TermContext termContext = contextArray[i]; @@ -186,6 +190,11 @@ public class CommonTermsQuery extends Query { } } + final int numLowFreqClauses = lowFreq.clauses().size(); + if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) { + int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses); + lowFreq.setMinimumNumberShouldMatch(minMustMatch); + } if (lowFreq.clauses().isEmpty()) { /* * if lowFreq is empty we rewrite the high freq terms in a conjunction to @@ -265,7 +274,9 @@ public class CommonTermsQuery extends Query { /** * Specifies a minimum number of the optional BooleanClauses which must be * satisfied in order to produce a match on the low frequency terms query - * part. + * part. This method accepts a float value in the range [0..1) as a fraction + * of the actual query terms in the low frequent clause or a number + * >=1 as an absolut number of clauses that need to match. * *

* By default no optional clauses are necessary for a match (unless there are @@ -276,7 +287,7 @@ public class CommonTermsQuery extends Query { * @param min * the number of optional clauses that must match */ - public void setMinimumNumberShouldMatch(int min) { + public void setMinimumNumberShouldMatch(float min) { this.minNrShouldMatch = min; } @@ -284,7 +295,7 @@ public class CommonTermsQuery extends Query { * Gets the minimum number of the optional BooleanClauses which must be * satisfied. */ - public int getMinimumNumberShouldMatch() { + public float getMinimumNumberShouldMatch() { return minNrShouldMatch; } @@ -332,7 +343,7 @@ public class CommonTermsQuery extends Query { result = prime * result + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode()); result = prime * result + Float.floatToIntBits(maxTermFrequency); - result = prime * result + minNrShouldMatch; + result = prime * result + Float.floatToIntBits(minNrShouldMatch); result = prime * result + ((terms == null) ? 0 : terms.hashCode()); return result; } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java index 7571753248d..e3595d37ad5 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java @@ -175,6 +175,90 @@ public class CommonTermsQueryTest extends LuceneTestCase { } } + public void testMinShouldMatch() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + String[] docs = new String[] {"this is the end of the world right", + "is this it or maybe not", + "this is the end of the universe as we know it", + "there is the famous restaurant at the end of the universe",}; + for (int i = 0; i < docs.length; i++) { + Document doc = new Document(); + doc.add(newStringField("id", "" + i, Field.Store.YES)); + doc.add(newTextField("field", docs[i], Field.Store.NO)); + w.addDocument(doc); + } + + IndexReader r = w.getReader(); + IndexSearcher s = newSearcher(r); + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + query.setMinimumNumberShouldMatch(0.5f); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 1); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + } + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + query.setMinimumNumberShouldMatch(2.0f); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 1); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + } + + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + query.setMinimumNumberShouldMatch(0.49f); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 3); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + assertEquals("2", r.document(search.scoreDocs[1].doc).get("id")); + assertEquals("3", r.document(search.scoreDocs[2].doc).get("id")); + } + + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + query.setMinimumNumberShouldMatch(1.0f); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 3); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + assertEquals("2", r.document(search.scoreDocs[1].doc).get("id")); + assertEquals("3", r.document(search.scoreDocs[2].doc).get("id")); + } + + r.close(); + w.close(); + dir.close(); + } + public void testIllegalOccur() { Random random = random();