From fa93e73225cb10fab3ac8890aab1b610d292e313 Mon Sep 17 00:00:00 2001
From: Doron Cohen <doronc@apache.org>
Date: Thu, 8 Sep 2011 06:51:40 +0000
Subject: [PATCH] LUCENE-3412: SloppyPhraseScorer was returning
 non-deterministic results for queries with many repeats.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166541 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |  3 ++
 .../lucene/search/SloppyPhraseScorer.java     | 28 +++++++++++--------
 .../lucene/search/TestSloppyPhraseQuery.java  | 18 ++++++++++++
 3 files changed, 37 insertions(+), 12 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 244ce3e327f..5c588d28aa5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -590,6 +590,9 @@ Bug fixes
   easily corrupt the index.  (Mark Miller, Robert Muir, Mike
   McCandless)
   
+* LUCENE-3412: SloppyPhraseScorer was returning non-deterministic results 
+  for queries with many repeats (Doron Cohen)
+  
 New Features
 
 * LUCENE-3290: Added FieldInvertState.numUniqueTerms 
diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
index d13f5cb41e6..882013979c8 100644
--- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
-import java.util.HashSet;
+import java.util.LinkedHashSet;
 
 final class SloppyPhraseScorer extends PhraseScorer {
     private int slop;
@@ -70,7 +70,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
                     break;
                 }
                 PhrasePositions pp2 = null;
-                tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null;
+                tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null;
                 if (pp2!=null && pp2!=pp) {
                   pp = flip(pp,pp2); // flip pp to pp2
                 }
@@ -118,7 +118,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
      *       position in doc. This case is detected by comparing just the (query) offsets, 
      *       and two such PPs are not considered "repeating". 
      * <br>- Also mark each such pp by pp.repeats = true.
-     * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
+     * <br>Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient.
      * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
      * <br>- Example 1 - query with no repetitions: "ho my"~2
      * <br>- Example 2 - query with repetitions: "ho my my"~2
@@ -147,11 +147,11 @@ final class SloppyPhraseScorer extends PhraseScorer {
         for (PhrasePositions pp = first; pp != null; pp = pp.next)
             pp.firstPosition();
         
-        // one time initializatin for this scorer
+        // one time initialization for this scorer
         if (!checkedRepeats) {
             checkedRepeats = true;
             // check for repeats
-            HashSet<PhrasePositions> m = null;
+            LinkedHashSet<PhrasePositions> m = null; // see comment (*) below why order is important
             for (PhrasePositions pp = first; pp != null; pp = pp.next) {
                 int tpPos = pp.position + pp.offset;
                 for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
@@ -161,7 +161,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
                     int tpPos2 = pp2.position + pp2.offset;
                     if (tpPos2 == tpPos) { 
                         if (m == null)
-                            m = new HashSet<PhrasePositions>();
+                            m = new LinkedHashSet<PhrasePositions>();
                         pp.repeats = true;
                         pp2.repeats = true;
                         m.add(pp);
@@ -173,14 +173,17 @@ final class SloppyPhraseScorer extends PhraseScorer {
                 repeats = m.toArray(new PhrasePositions[0]);
         }
         
-        // with repeats must advance some repeating pp's so they all start with differing tp's       
+        // with repeats must advance some repeating pp's so they all start with differing tp's
+        // (*) It is important that pps are handled by their original order in the query,
+        // because we advance the pp with larger offset, and so processing them in that order
+        // allows to cover all pairs.
         if (repeats!=null) {
             for (int i = 0; i < repeats.length; i++) {
                 PhrasePositions pp = repeats[i];
                 PhrasePositions pp2;
-                while ((pp2 = termPositionsDiffer(pp)) != null) {
-                  if (!pp2.nextPosition())  // out of pps that do not differ, advance the pp with higher offset 
-                      return -1;           // ran out of a term -- done  
+                while ((pp2 = termPositionsConflict(pp)) != null) {
+                  if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset
+                      return -1;           // ran out of a term -- done
                 } 
             }
         }
@@ -196,6 +199,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
         if (repeats!=null) {
           tmpPos = new PhrasePositions[pq.size()];
         }
+        
         return end;
     }
 
@@ -205,9 +209,9 @@ final class SloppyPhraseScorer extends PhraseScorer {
      * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
      * out of the first two PPs found to not differ.
      */
-    private PhrasePositions termPositionsDiffer(PhrasePositions pp) {
+    private PhrasePositions termPositionsConflict(PhrasePositions pp) {
         // efficiency note: a more efficient implementation could keep a map between repeating 
-        // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats 
+        // pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats 
         // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. 
         // However this would complicate code, for a rather rare case, so choice is to compromise here.
         int tpPos = pp.position + pp.offset;
diff --git a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java
index e7ae2eafa3a..c7076c2542a 100755
--- a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java
@@ -41,10 +41,13 @@ public class TestSloppyPhraseQuery extends LuceneTestCase {
   private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z");
   private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y");
   private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A");
+  private static final Document DOC_5_3 = makeDocument("H H H X X X H H H X X X H H H");
+  private static final Document DOC_5_4 = makeDocument("H H H H");
 
   private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 );
   private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 );
   private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A");
+  private static final PhraseQuery QUERY_5_4 = makePhraseQuery( "H H H H");
 
   /**
    * Test DOC_4 and QUERY_4.
@@ -112,6 +115,21 @@ public class TestSloppyPhraseQuery extends LuceneTestCase {
     }
   }
 
+  /** LUCENE-3412 */
+  public void testDoc5_Query5_Any_Slop_Should_be_consistent() throws Exception {
+    int nRepeats = 5;
+    for (int slop=0; slop<3; slop++) {
+      for (int trial=0; trial<nRepeats; trial++) {
+        // should steadily always find this one
+        checkPhraseQuery(DOC_5_4, QUERY_5_4, slop, 1);
+      }
+      for (int trial=0; trial<nRepeats; trial++) {
+        // should steadily never find this one
+        checkPhraseQuery(DOC_5_3, QUERY_5_4, slop, 0);
+      }
+    }
+  }
+  
   private float  checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception {
     query.setSlop(slop);