diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 19d6d67a3e4..2201088b1a6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -451,6 +451,9 @@ Bug fixes indexes, causing existing deletions to be applied on the incoming indexes as well. (Shai Erera, Mike McCandless) +* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple + query terms had same position in the query. (Doron Cohen) + Test Cases * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index cc9fad302d0..381518bbe10 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -18,7 +18,7 @@ package org.apache.lucene.search; */ import java.io.IOException; -import java.util.HashMap; +import java.util.HashSet; final class SloppyPhraseScorer extends PhraseScorer { private int slop; @@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends PhraseScorer { /** * Init PhrasePositions in place. - * There is a one time initialization for this scorer: + * There is a one time initialization for this scorer (taking place at the first doc that matches all terms): *
- Put in repeats[] each pp that has another pp with same position in the doc. + * This relies on that the position in PP is computed as (TP.position - offset) and + * so by adding offset we actually compare positions and identify that the two are + * the same term. + * An exclusion to this is two distinct terms in the same offset in query and same + * position in doc. This case is detected by comparing just the (query) offsets, + * and two such PPs are not considered "repeating". *
- Also mark each such pp by pp.repeats = true. *
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. * In particular, this allows to score queries with no repetitions with no overhead due to this computation. @@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends PhraseScorer { if (!checkedRepeats) { checkedRepeats = true; // check for repeats - HashMap m = null; + HashSet m = null; for (PhrasePositions pp = first; pp != null; pp = pp.next) { int tpPos = pp.position + pp.offset; for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { + if (pp.offset == pp2.offset) { + continue; // not a repetition: the two PPs are originally in same offset in the query! + } int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) { if (m == null) - m = new HashMap(); + m = new HashSet(); pp.repeats = true; pp2.repeats = true; - m.put(pp,null); - m.put(pp2,null); + m.add(pp); + m.add(pp2); } } } if (m!=null) - repeats = m.keySet().toArray(new PhrasePositions[0]); + repeats = m.toArray(new PhrasePositions[0]); } // with repeats must advance some repeating pp's so they all start with differing tp's @@ -204,11 +213,16 @@ final class SloppyPhraseScorer extends PhraseScorer { int tpPos = pp.position + pp.offset; for (int i = 0; i < repeats.length; i++) { PhrasePositions pp2 = repeats[i]; - if (pp2 == pp) + if (pp2 == pp) { continue; + } + if (pp.offset == pp2.offset) { + continue; // not a repetition: the two PPs are originally in same offset in the query! + } int tpPos2 = pp2.position + pp2.offset; - if (tpPos2 == tpPos) + if (tpPos2 == tpPos) { return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset. + } } return null; } diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index 2cca7411064..2db0d1da438 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -17,11 +17,14 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -423,7 +426,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase { mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0); } TopDocs hits = s.search(mpq, 2); - assert hits.totalHits == 2; + assertEquals(2, hits.totalHits); assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5); /* for(int hit=0;hit