From 4d29e7e20ed0d0de5f593d5e7b36e58c7932aa83 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 30 Jul 2013 14:08:14 +0000 Subject: [PATCH] LUCENE-5149: CommonTermsQuery should allow minNrShouldMatch for high & low freq terms git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508453 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 +- .../lucene/queries/CommonTermsQuery.java | 81 +++++++++++++++---- .../lucene/queries/CommonTermsQueryTest.java | 38 +++++++-- 3 files changed, 102 insertions(+), 23 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 95a5f46418c..dcba189f7a1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -130,7 +130,11 @@ API Changes migrated to the new FacetsAggregator and FacetsAccumulator API. Also, FacetRequest.createAggregator was replaced by OldFacetsAccumulator.createAggregator. (Shai Erera) - + +* LUCENE-5149: CommonTermsQuery now allows to set the minimum number of terms that + should match for its high and low frequent sub-queries. Previously this was only + supported on the low frequent terms query. (Simon Willnauer) + Optimizations * LUCENE-5088: Added TermFilter to filter docs by a specific term. diff --git a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java index cbf652fa114..c0389a79d99 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/CommonTermsQuery.java @@ -74,7 +74,8 @@ public class CommonTermsQuery extends Query { protected final Occur highFreqOccur; protected float lowFreqBoost = 1.0f; protected float highFreqBoost = 1.0f; - protected float minNrShouldMatch = 0; + protected float lowFreqMinNrShouldMatch = 0; + protected float highFreqMinNrShouldMatch = 0; /** * Creates a new {@link CommonTermsQuery} @@ -161,10 +162,18 @@ public class CommonTermsQuery extends Query { } protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) { - if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) { - return (int) minNrShouldMatch; - } - return (int) (Math.round(minNrShouldMatch * numOptional)); + return minNrShouldMatch(lowFreqMinNrShouldMatch, numOptional); + } + + protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) { + return minNrShouldMatch(highFreqMinNrShouldMatch, numOptional); + } + + private final int minNrShouldMatch(float minNrShouldMatch, int numOptional) { + if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) { + return (int) minNrShouldMatch; + } + return (int) (Math.round(minNrShouldMatch * numOptional)); } protected Query buildQuery(final int maxDoc, @@ -190,11 +199,16 @@ public class CommonTermsQuery extends Query { } } - final int numLowFreqClauses = lowFreq.clauses().size(); + final int numLowFreqClauses = lowFreq.clauses().size(); + final int numHighFreqClauses = highFreq.clauses().size(); if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) { int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses); lowFreq.setMinimumNumberShouldMatch(minMustMatch); } + if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) { + int minMustMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses); + highFreq.setMinimumNumberShouldMatch(minMustMatch); + } if (lowFreq.clauses().isEmpty()) { /* * if lowFreq is empty we rewrite the high freq terms in a conjunction to @@ -272,7 +286,7 @@ public class CommonTermsQuery extends Query { } /** - * Specifies a minimum number of the optional BooleanClauses which must be + * Specifies a minimum number of the low frequent optional BooleanClauses which must be * satisfied in order to produce a match on the low frequency terms query * part. This method accepts a float value in the range [0..1) as a fraction * of the actual query terms in the low frequent clause or a number @@ -287,16 +301,44 @@ public class CommonTermsQuery extends Query { * @param min * the number of optional clauses that must match */ - public void setMinimumNumberShouldMatch(float min) { - this.minNrShouldMatch = min; + public void setLowFreqMinimumNumberShouldMatch(float min) { + this.lowFreqMinNrShouldMatch = min; } /** - * Gets the minimum number of the optional BooleanClauses which must be + * Gets the minimum number of the optional low frequent BooleanClauses which must be * satisfied. */ - public float getMinimumNumberShouldMatch() { - return minNrShouldMatch; + public float getLowFreqMinimumNumberShouldMatch() { + return lowFreqMinNrShouldMatch; + } + + /** + * Specifies a minimum number of the high frequent optional BooleanClauses which must be + * satisfied in order to produce a match on the low frequency terms query + * part. This method accepts a float value in the range [0..1) as a fraction + * of the actual query terms in the low frequent clause or a number + * >=1 as an absolut number of clauses that need to match. + * + *

+ * By default no optional clauses are necessary for a match (unless there are + * no required clauses). If this method is used, then the specified number of + * clauses is required. + *

+ * + * @param min + * the number of optional clauses that must match + */ + public void setHighFreqMinimumNumberShouldMatch(float min) { + this.highFreqMinNrShouldMatch = min; + } + + /** + * Gets the minimum number of the optional high frequent BooleanClauses which must be + * satisfied. + */ + public float getHighFreqMinimumNumberShouldMatch() { + return highFreqMinNrShouldMatch; } @Override @@ -308,7 +350,7 @@ public class CommonTermsQuery extends Query { public String toString(String field) { StringBuilder buffer = new StringBuilder(); boolean needParens = (getBoost() != 1.0) - || (getMinimumNumberShouldMatch() > 0); + || (getLowFreqMinimumNumberShouldMatch() > 0); if (needParens) { buffer.append("("); } @@ -321,9 +363,12 @@ public class CommonTermsQuery extends Query { if (needParens) { buffer.append(")"); } - if (getMinimumNumberShouldMatch() > 0) { + if (getLowFreqMinimumNumberShouldMatch() > 0 || getHighFreqMinimumNumberShouldMatch() > 0) { buffer.append('~'); - buffer.append(getMinimumNumberShouldMatch()); + buffer.append("("); + buffer.append(getLowFreqMinimumNumberShouldMatch()); + buffer.append(getHighFreqMinimumNumberShouldMatch()); + buffer.append(")"); } if (getBoost() != 1.0f) { buffer.append(ToStringUtils.boost(getBoost())); @@ -343,7 +388,8 @@ public class CommonTermsQuery extends Query { result = prime * result + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode()); result = prime * result + Float.floatToIntBits(maxTermFrequency); - result = prime * result + Float.floatToIntBits(minNrShouldMatch); + result = prime * result + Float.floatToIntBits(lowFreqMinNrShouldMatch); + result = prime * result + Float.floatToIntBits(highFreqMinNrShouldMatch); result = prime * result + ((terms == null) ? 0 : terms.hashCode()); return result; } @@ -363,7 +409,8 @@ public class CommonTermsQuery extends Query { if (lowFreqOccur != other.lowFreqOccur) return false; if (Float.floatToIntBits(maxTermFrequency) != Float .floatToIntBits(other.maxTermFrequency)) return false; - if (minNrShouldMatch != other.minNrShouldMatch) return false; + if (lowFreqMinNrShouldMatch != other.lowFreqMinNrShouldMatch) return false; + if (highFreqMinNrShouldMatch != other.highFreqMinNrShouldMatch) return false; if (terms == null) { if (other.terms != null) return false; } else if (!terms.equals(other.terms)) return false; diff --git a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java index 0e94d9d08ad..60be3e80831 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/CommonTermsQueryTest.java @@ -19,6 +19,7 @@ package org.apache.lucene.queries; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Random; @@ -146,6 +147,8 @@ public class CommonTermsQueryTest extends LuceneTestCase { left.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil .randomRealisticUnicodeString(r))); } + left.setHighFreqMinimumNumberShouldMatch(r.nextInt(4)); + left.setLowFreqMinimumNumberShouldMatch(r.nextInt(4)); r = new Random(seed); CommonTermsQuery right = new CommonTermsQuery(randomOccur(r), @@ -155,6 +158,8 @@ public class CommonTermsQueryTest extends LuceneTestCase { right.add(new Term(_TestUtil.randomRealisticUnicodeString(r), _TestUtil .randomRealisticUnicodeString(r))); } + right.setHighFreqMinimumNumberShouldMatch(r.nextInt(4)); + right.setLowFreqMinimumNumberShouldMatch(r.nextInt(4)); QueryUtils.checkEqual(left, right); } } @@ -200,7 +205,7 @@ public class CommonTermsQueryTest extends LuceneTestCase { query.add(new Term("field", "world")); query.add(new Term("field", "universe")); query.add(new Term("field", "right")); - query.setMinimumNumberShouldMatch(0.5f); + query.setLowFreqMinimumNumberShouldMatch(0.5f); TopDocs search = s.search(query, 10); assertEquals(search.totalHits, 1); assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); @@ -214,7 +219,7 @@ public class CommonTermsQueryTest extends LuceneTestCase { query.add(new Term("field", "world")); query.add(new Term("field", "universe")); query.add(new Term("field", "right")); - query.setMinimumNumberShouldMatch(2.0f); + query.setLowFreqMinimumNumberShouldMatch(2.0f); TopDocs search = s.search(query, 10); assertEquals(search.totalHits, 1); assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); @@ -229,7 +234,7 @@ public class CommonTermsQueryTest extends LuceneTestCase { query.add(new Term("field", "world")); query.add(new Term("field", "universe")); query.add(new Term("field", "right")); - query.setMinimumNumberShouldMatch(0.49f); + query.setLowFreqMinimumNumberShouldMatch(0.49f); TopDocs search = s.search(query, 10); assertEquals(search.totalHits, 3); assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); @@ -246,14 +251,37 @@ public class CommonTermsQueryTest extends LuceneTestCase { query.add(new Term("field", "world")); query.add(new Term("field", "universe")); query.add(new Term("field", "right")); - query.setMinimumNumberShouldMatch(1.0f); + query.setLowFreqMinimumNumberShouldMatch(1.0f); TopDocs search = s.search(query, 10); assertEquals(search.totalHits, 3); assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); assertEquals("2", r.document(search.scoreDocs[1].doc).get("id")); assertEquals("3", r.document(search.scoreDocs[2].doc).get("id")); + assertTrue(search.scoreDocs[1].score > search.scoreDocs[2].score); + } + + { + CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, + random().nextBoolean() ? 2.0f : 0.5f); + query.add(new Term("field", "is")); + query.add(new Term("field", "this")); + query.add(new Term("field", "end")); + query.add(new Term("field", "world")); + query.add(new Term("field", "universe")); + query.add(new Term("field", "right")); + query.setLowFreqMinimumNumberShouldMatch(1.0f); + query.setHighFreqMinimumNumberShouldMatch(4.0f); + TopDocs search = s.search(query, 10); + assertEquals(search.totalHits, 3); + assertEquals(search.scoreDocs[1].score, search.scoreDocs[2].score, 0.0f); + assertEquals("0", r.document(search.scoreDocs[0].doc).get("id")); + // doc 2 and 3 only get a score from low freq terms + assertEquals( + new HashSet<>(Arrays.asList("2", "3")), + new HashSet<>(Arrays.asList( + r.document(search.scoreDocs[1].doc).get("id"), + r.document(search.scoreDocs[2].doc).get("id")))); } - r.close(); w.close(); dir.close();