LUCENE-3054: PhraseQuery can in some cases stack overflow in SorterTemplate.quickSort(). This fix also adds an optimization to PhraseQuery as term with lower doc freq will also have less positions

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098633 13f79535-47bb-0310-9956-ffa450edef68
2011-05-02 15:47:38 +00:00 · 2011-05-02 15:47:38 +00:00 · 679cfee191
parent 11bed874c2
commit 679cfee191
5 changed files with 73 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -491,6 +491,11 @@ Bug fixes
  very special use cases of the TokenStream-API, most users would not
  have recognized it.  (Uwe Schindler, Robert Muir)

+* LUCENE-3054: PhraseQuery can in some cases stack overflow in
+  SorterTemplate.quickSort(). This fix also adds an optimization to
+  PhraseQuery as term with lower doc freq will also have less positions.
+  (Uwe Schindler, Robert Muir, Otis Gospodnetic)
+
 ======================= Lucene 3.1.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@ -214,7 +214,7 @@ public class MultiPhraseQuery extends Query {
          docFreq = reader.docFreq(term.field(), term.bytes());
        }

-        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue());
+        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]);
      }

      // sort by increasing docFreq order
--- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java
@ -124,16 +124,48 @@ public class PhraseQuery extends Query {
    final DocsAndPositionsEnum postings;
    final int docFreq;
    final int position;
+    final Term term;

-    public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position) {
+    public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) {
      this.postings = postings;
      this.docFreq = docFreq;
      this.position = position;
+      this.term = term;
    }

    public int compareTo(PostingsAndFreq other) {
+      if (docFreq == other.docFreq) {
+        if (position == other.position) {
+          return term.compareTo(other.term);
+        }
+        return position - other.position;
+      }
      return docFreq - other.docFreq;
    }
+
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + docFreq;
+      result = prime * result + position;
+      result = prime * result + ((term == null) ? 0 : term.hashCode());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) return true;
+      if (obj == null) return false;
+      if (getClass() != obj.getClass()) return false;
+      PostingsAndFreq other = (PostingsAndFreq) obj;
+      if (docFreq != other.docFreq) return false;
+      if (position != other.position) return false;
+      if (term == null) {
+        if (other.term != null) return false;
+      } else if (!term.equals(other.term)) return false;
+      return true;
+    }
  }

  private class PhraseWeight extends Weight {
@ -197,7 +229,7 @@ public class PhraseQuery extends Query {
            return null;
          }
        }
-        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue());
+        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t);
      }

      // sort by increasing docFreq order
--- a/lucene/src/java/org/apache/lucene/util/SorterTemplate.java
+++ b/lucene/src/java/org/apache/lucene/util/SorterTemplate.java
@ -30,6 +30,7 @@ package org.apache.lucene.util;
 public abstract class SorterTemplate {

  private static final int MERGESORT_THRESHOLD = 12;
+  private static final int MERGE_TO_QUICKSORT_THRESHOLD = 40;
  private static final int QUICKSORT_THRESHOLD = 7;

  /** Implement this method, that swaps slots {@code i} and {@code j} in your data */
@ -63,6 +64,10 @@ public abstract class SorterTemplate {
  /** Sorts via in-place, but unstable, QuickSort algorithm.
   * For small collections falls back to {@link #insertionSort(int,int)}. */
  public final void quickSort(int lo, int hi) {
+    quickSort(lo, hi, MERGE_TO_QUICKSORT_THRESHOLD);
+  }
+  
+  private void quickSort(int lo, int hi, int maxDepth) {
    final int diff = hi - lo;
    if (diff <= QUICKSORT_THRESHOLD) {
      insertionSort(lo, hi);
@ -101,8 +106,16 @@ public abstract class SorterTemplate {
      }
    }

-    quickSort(lo, left);
-    quickSort(left + 1, hi);
+    // fall back to merge sort when recursion depth gets too big
+    if (maxDepth == 0) {
+      // for testing: new Exception("Hit recursion depth limit").printStackTrace();
+      mergeSort(lo, left);
+      mergeSort(left + 1, hi);
+    } else {
+      --maxDepth;
+      quickSort(lo, left, maxDepth);
+      quickSort(left + 1, hi, maxDepth);
+    }
  }
  
  /** Sorts via stable in-place MergeSort algorithm
--- a/lucene/src/test/org/apache/lucene/util/TestArrayUtil.java
+++ b/lucene/src/test/org/apache/lucene/util/TestArrayUtil.java
@ -144,6 +144,24 @@ public class TestArrayUtil extends LuceneTestCase {
    }
  }
  
+  private Integer[] createSparseRandomArray(int maxSize) {
+    final Integer[] a = new Integer[random.nextInt(maxSize) + 1];
+    for (int i = 0; i < a.length; i++) {
+      a[i] = Integer.valueOf(random.nextInt(2));
+    }
+    return a;
+  }
+  
+  // This is a test for LUCENE-3054 (which fails without the merge sort fall back with stack overflow in most cases)
+  public void testQuickToMergeSortFallback() {
+    for (int i = 0, c = 500 * RANDOM_MULTIPLIER; i < c; i++) {
+      Integer[] a1 = createSparseRandomArray(40000), a2 = a1.clone();
+      ArrayUtil.quickSort(a1);
+      Arrays.sort(a2);
+      assertArrayEquals(a2, a1);
+    }
+  }
+  
  public void testMergeSort() {
    for (int i = 0, c = 500 * RANDOM_MULTIPLIER; i < c; i++) {
      Integer[] a1 = createRandomArray(1000), a2 = a1.clone();