From 7d0dbdaf62299389876a01621430b3ede572ed80 Mon Sep 17 00:00:00 2001 From: Paul Sanwald Date: Tue, 29 Oct 2019 04:21:52 -0400 Subject: [PATCH] LUCENE-9024: Optimize IntroSelector to use median of medians (#966) --- lucene/CHANGES.txt | 4 + .../org/apache/lucene/util/IntroSelector.java | 97 +++++++++++++++---- .../apache/lucene/util/TestIntroSelector.java | 2 +- 3 files changed, 85 insertions(+), 18 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 89406a0174e..d2d111b29d8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -35,6 +35,10 @@ Optimizations * LUCENE-8932: BKDReader's index is now stored off-heap when the IndexInput is an instance of ByteBufferIndexInput. (Jack Conradson via Adrien Grand) +* LUCENE-9024: IntroSelector now falls back to the median of medians algorithm + instead of sorting when the maximum recursion level is exceeded, providing + better worst-case runtime. (Paul Sanwald via Adrien Grand) + Bug Fixes * LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski) diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java index 298480182b2..b492d22705f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java @@ -20,9 +20,8 @@ import java.util.Comparator; /** Implementation of the quick select algorithm. *

It uses the median of the first, middle and last values as a pivot and - * falls back to a heap sort when the number of recursion levels exceeds - * {@code 2 lg(n)}, as a consequence it runs in linear time on average and in - * {@code n log(n)} time in the worst case.

+ * falls back to a median of medians when the number of recursion levels exceeds + * {@code 2 lg(n)}, as a consequence it runs in linear time on average.

* @lucene.internal */ public abstract class IntroSelector extends Selector { @@ -33,26 +32,90 @@ public abstract class IntroSelector extends Selector { quickSelect(from, to, k, maxDepth); } - // heap sort - // TODO: use median of median instead to have linear worst-case rather than - // n*log(n) - void slowSelect(int from, int to, int k) { - new Sorter() { + int slowSelect(int from, int to, int k) { + return medianOfMediansSelect(from, to-1, k); + } - @Override - protected void swap(int i, int j) { - IntroSelector.this.swap(i, j); + int medianOfMediansSelect(int left, int right, int k) { + do { + // Defensive check, this is also checked in the calling + // method. Including here so this method can be used + // as a self contained quickSelect implementation. + if (left == right) { + return left; } + int pivotIndex = pivot(left, right); + pivotIndex = partition(left, right, k, pivotIndex); + if (k == pivotIndex) { + return k; + } else if (k < pivotIndex) { + right = pivotIndex-1; + } else { + left = pivotIndex+1; + } + } while (left != right); + return left; + } - @Override - protected int compare(int i, int j) { - return IntroSelector.this.compare(i, j); + private int partition(int left, int right, int k, int pivotIndex) { + setPivot(pivotIndex); + swap(pivotIndex, right); + int storeIndex = left; + for (int i = left; i < right; i++) { + if (comparePivot(i) > 0) { + swap(storeIndex, i); + storeIndex++; } + } + int storeIndexEq = storeIndex; + for (int i = storeIndex; i < right; i++) { + if (comparePivot(i) == 0) { + swap(storeIndexEq, i); + storeIndexEq++; + } + } + swap(right, storeIndexEq); + if (k < storeIndex) { + return storeIndex; + } else if (k <= storeIndexEq) { + return k; + } + return storeIndexEq; + } - public void sort(int from, int to) { - heapSort(from, to); + private int pivot(int left, int right) { + if (right - left < 5) { + int pivotIndex = partition5(left, right); + return pivotIndex; + } + + for (int i = left; i <= right; i=i+5) { + int subRight = i + 4; + if (subRight > right) { + subRight = right; } - }.sort(from, to); + int median5 = partition5(i, subRight); + swap(median5, left + ((i-left)/5)); + } + int mid = ((right - left) / 10) + left + 1; + int to = left + ((right - left)/5); + return medianOfMediansSelect(left, to, mid); + } + + // selects the median of a group of at most five elements, + // implemented using insertion sort. Efficient due to + // bounded nature of data set. + private int partition5(int left, int right) { + int i = left + 1; + while( i <= right) { + int j = i; + while (j > left && compare(j-1,j)>0) { + swap(j-1, j); + j--; + } + i++; + } + return (left + right) >>> 1; } private void quickSelect(int from, int to, int k, int maxDepth) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java index 468b2f78809..c10269afd90 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java @@ -36,7 +36,7 @@ public class TestIntroSelector extends LuceneTestCase { final int from = random().nextInt(5); final int to = from + TestUtil.nextInt(random(), 1, 10000); final int max = random().nextBoolean() ? random().nextInt(100) : random().nextInt(100000); - Integer[] arr = new Integer[from + to + random().nextInt(5)]; + Integer[] arr = new Integer[to + random().nextInt(5)]; for (int i = 0; i < arr.length; ++i) { arr[i] = TestUtil.nextInt(random(), 0, max); }