diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 19d6d67a3e4..2201088b1a6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -451,6 +451,9 @@ Bug fixes
indexes, causing existing deletions to be applied on the incoming indexes as
well. (Shai Erera, Mike McCandless)
+* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
+ query terms had same position in the query. (Doron Cohen)
+
Test Cases
* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
index cc9fad302d0..381518bbe10 100644
--- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
-import java.util.HashMap;
+import java.util.HashSet;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
@@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends PhraseScorer {
/**
* Init PhrasePositions in place.
- * There is a one time initialization for this scorer:
+ * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
*
- Put in repeats[] each pp that has another pp with same position in the doc.
+ * This relies on that the position in PP is computed as (TP.position - offset) and
+ * so by adding offset we actually compare positions and identify that the two are
+ * the same term.
+ * An exclusion to this is two distinct terms in the same offset in query and same
+ * position in doc. This case is detected by comparing just the (query) offsets,
+ * and two such PPs are not considered "repeating".
*
- Also mark each such pp by pp.repeats = true.
*
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
@@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends PhraseScorer {
if (!checkedRepeats) {
checkedRepeats = true;
// check for repeats
- HashMap m = null;
+ HashSet m = null;
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
int tpPos = pp.position + pp.offset;
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+ if (pp.offset == pp2.offset) {
+ continue; // not a repetition: the two PPs are originally in same offset in the query!
+ }
int tpPos2 = pp2.position + pp2.offset;
if (tpPos2 == tpPos) {
if (m == null)
- m = new HashMap();
+ m = new HashSet();
pp.repeats = true;
pp2.repeats = true;
- m.put(pp,null);
- m.put(pp2,null);
+ m.add(pp);
+ m.add(pp2);
}
}
}
if (m!=null)
- repeats = m.keySet().toArray(new PhrasePositions[0]);
+ repeats = m.toArray(new PhrasePositions[0]);
}
// with repeats must advance some repeating pp's so they all start with differing tp's
@@ -204,11 +213,16 @@ final class SloppyPhraseScorer extends PhraseScorer {
int tpPos = pp.position + pp.offset;
for (int i = 0; i < repeats.length; i++) {
PhrasePositions pp2 = repeats[i];
- if (pp2 == pp)
+ if (pp2 == pp) {
continue;
+ }
+ if (pp.offset == pp2.offset) {
+ continue; // not a repetition: the two PPs are originally in same offset in the query!
+ }
int tpPos2 = pp2.position + pp2.offset;
- if (tpPos2 == tpPos)
+ if (tpPos2 == tpPos) {
return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
+ }
}
return null;
}
diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
index 2cca7411064..2db0d1da438 100644
--- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@@ -17,11 +17,14 @@ package org.apache.lucene.search;
* limitations under the License.
*/
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -423,7 +426,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
}
TopDocs hits = s.search(mpq, 2);
- assert hits.totalHits == 2;
+ assertEquals(2, hits.totalHits);
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
/*
for(int hit=0;hit