From 8613df447f0ab431b469074b4bd89b8ce0525959 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Sat, 16 Mar 2013 16:33:30 +0000 Subject: [PATCH] LUCENE-4839: Sorter API: Use TimSort to sort doc IDs and postings lists. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457272 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/util/ArrayUtil.java | 80 +++++++ .../apache/lucene/util/CollectionUtil.java | 51 +++++ .../apache/lucene/util/SorterTemplate.java | 213 +++++++++++++++++- .../org/apache/lucene/util/TestArrayUtil.java | 76 ++++++- .../lucene/util/TestCollectionUtil.java | 54 ++++- .../lucene/util/TestSorterTemplate.java | 167 ++++++++++++++ .../apache/lucene/index/sorter/Sorter.java | 5 +- .../index/sorter/SortingAtomicReader.java | 26 +-- 8 files changed, 650 insertions(+), 22 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/util/TestSorterTemplate.java diff --git a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java index 9b722f9719e..d31a2e43f33 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java @@ -744,6 +744,46 @@ public final class ArrayUtil { mergeSort(a, 0, a.length); } + // timSorts: + + /** + * Sorts the given array slice using the {@link Comparator}. This method uses the TimSort + * algorithm, but falls back to binary sort for small arrays. + * @param fromIndex start index (inclusive) + * @param toIndex end index (exclusive) + */ + public static void timSort(T[] a, int fromIndex, int toIndex, Comparator comp) { + if (toIndex-fromIndex <= 1) return; + getSorter(a, comp).timSort(fromIndex, toIndex-1); + } + + /** + * Sorts the given array using the {@link Comparator}. This method uses the TimSort + * algorithm, but falls back to binary sort for small arrays. + */ + public static void timSort(T[] a, Comparator comp) { + timSort(a, 0, a.length, comp); + } + + /** + * Sorts the given array slice in natural order. This method uses the TimSort + * algorithm, but falls back to binary sort for small arrays. + * @param fromIndex start index (inclusive) + * @param toIndex end index (exclusive) + */ + public static > void timSort(T[] a, int fromIndex, int toIndex) { + if (toIndex-fromIndex <= 1) return; + getSorter(a).timSort(fromIndex, toIndex-1); + } + + /** + * Sorts the given array in natural order. This method uses the TimSort + * algorithm, but falls back to binary sort for small arrays. + */ + public static > void timSort(T[] a) { + timSort(a, 0, a.length); + } + // insertionSorts: /** @@ -784,4 +824,44 @@ public final class ArrayUtil { insertionSort(a, 0, a.length); } + // binarySorts: + + /** + * Sorts the given array slice using the {@link Comparator}. This method uses the binary sort + * algorithm. It is only recommended to use this algorithm for small arrays! + * @param fromIndex start index (inclusive) + * @param toIndex end index (exclusive) + */ + public static void binarySort(T[] a, int fromIndex, int toIndex, Comparator comp) { + if (toIndex-fromIndex <= 1) return; + getSorter(a, comp).binarySort(fromIndex, toIndex-1); + } + + /** + * Sorts the given array using the {@link Comparator}. This method uses the binary sort + * algorithm. It is only recommended to use this algorithm for small arrays! + */ + public static void binarySort(T[] a, Comparator comp) { + binarySort(a, 0, a.length, comp); + } + + /** + * Sorts the given array slice in natural order. This method uses the binary sort + * algorithm. It is only recommended to use this algorithm for small arrays! + * @param fromIndex start index (inclusive) + * @param toIndex end index (exclusive) + */ + public static > void binarySort(T[] a, int fromIndex, int toIndex) { + if (toIndex-fromIndex <= 1) return; + getSorter(a).binarySort(fromIndex, toIndex-1); + } + + /** + * Sorts the given array in natural order. This method uses the binary sort + * algorithm. It is only recommended to use this algorithm for small arrays! + */ + public static > void binarySort(T[] a) { + binarySort(a, 0, a.length); + } + } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/util/CollectionUtil.java b/lucene/core/src/java/org/apache/lucene/util/CollectionUtil.java index 26478934b03..486a5a365d5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/CollectionUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/CollectionUtil.java @@ -143,6 +143,32 @@ public final class CollectionUtil { getSorter(list).mergeSort(0, size-1); } + // timSorts: + + /** + * Sorts the given random access {@link List} using the {@link Comparator}. + * The list must implement {@link RandomAccess}. This method uses the TimSort + * algorithm, but falls back to binary sort for small lists. + * @throws IllegalArgumentException if list is e.g. a linked list without random access. + */ + public static void timSort(List list, Comparator comp) { + final int size = list.size(); + if (size <= 1) return; + getSorter(list, comp).timSort(0, size-1); + } + + /** + * Sorts the given random access {@link List} in natural order. + * The list must implement {@link RandomAccess}. This method uses the TimSort + * algorithm, but falls back to binary sort for small lists. + * @throws IllegalArgumentException if list is e.g. a linked list without random access. + */ + public static > void timSort(List list) { + final int size = list.size(); + if (size <= 1) return; + getSorter(list).timSort(0, size-1); + } + // insertionSorts: /** @@ -168,5 +194,30 @@ public final class CollectionUtil { if (size <= 1) return; getSorter(list).insertionSort(0, size-1); } + + // binarySorts: + /** + * Sorts the given random access {@link List} using the {@link Comparator}. + * The list must implement {@link RandomAccess}. This method uses the binary sort + * algorithm. It is only recommended to use this algorithm for small lists! + * @throws IllegalArgumentException if list is e.g. a linked list without random access. + */ + public static void binarySort(List list, Comparator comp) { + final int size = list.size(); + if (size <= 1) return; + getSorter(list, comp).binarySort(0, size-1); + } + + /** + * Sorts the given random access {@link List} in natural order. + * The list must implement {@link RandomAccess}. This method uses the insertion sort + * algorithm. It is only recommended to use this algorithm for small lists! + * @throws IllegalArgumentException if list is e.g. a linked list without random access. + */ + public static > void binarySort(List list) { + final int size = list.size(); + if (size <= 1) return; + getSorter(list).binarySort(0, size-1); + } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/util/SorterTemplate.java b/lucene/core/src/java/org/apache/lucene/util/SorterTemplate.java index 0ab243e5d9a..435b09ae59d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/SorterTemplate.java +++ b/lucene/core/src/java/org/apache/lucene/util/SorterTemplate.java @@ -1,5 +1,6 @@ package org.apache.lucene.util; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -29,9 +30,27 @@ package org.apache.lucene.util; */ public abstract class SorterTemplate { + private static final int TIMSORT_MINRUN = 32; + private static final int TIMSORT_THRESHOLD = 64; + private static final int TIMSORT_STACKSIZE = 40; // change if you change TIMSORT_MINRUN private static final int MERGESORT_THRESHOLD = 12; private static final int QUICKSORT_THRESHOLD = 7; + static { + // check whether TIMSORT_STACKSIZE is large enough + // for a run length of TIMSORT_MINRUN and an array + // of 2B values when TimSort invariants are verified + final long[] lengths = new long[TIMSORT_STACKSIZE]; + lengths[0] = TIMSORT_MINRUN; + lengths[1] = lengths[0] + 1; + for (int i = 2; i < TIMSORT_STACKSIZE; ++i) { + lengths[i] = lengths[i-2] + lengths[i-1] + 1; + } + if (lengths[TIMSORT_STACKSIZE - 1] < Integer.MAX_VALUE) { + throw new Error("TIMSORT_STACKSIZE is too small"); + } + } + /** Implement this method, that swaps slots {@code i} and {@code j} in your data */ protected abstract void swap(int i, int j); @@ -46,7 +65,7 @@ public abstract class SorterTemplate { * Should be implemented like pivot.compareTo(valueOf(j)) */ protected abstract int comparePivot(int j); - /** Sorts via stable in-place InsertionSort algorithm + /** Sorts via stable in-place InsertionSort algorithm (O(n2)) *(ideal for small collections which are mostly presorted). */ public final void insertionSort(int lo, int hi) { for (int i = lo + 1 ; i <= hi; i++) { @@ -60,6 +79,28 @@ public abstract class SorterTemplate { } } + /** Sorts via stable in-place BinarySort algorithm (O(n2)) + * (ideal for small collections which are in random order). */ + public final void binarySort(int lo, int hi) { + for (int i = lo + 1; i <= hi; ++i) { + int l = lo; + int h = i - 1; + setPivot(i); + while (l <= h) { + final int mid = (l + h) >>> 1; + final int cmp = comparePivot(mid); + if (cmp < 0) { + h = mid - 1; + } else { + l = mid + 1; + } + } + for (int j = i; j > l; --j) { + swap(j - 1, j); + } + } + } + /** Sorts via in-place, but unstable, QuickSort algorithm. * For small collections falls back to {@link #insertionSort(int,int)}. */ public final void quickSort(final int lo, final int hi) { @@ -117,7 +158,175 @@ public abstract class SorterTemplate { quickSort(lo, left, maxDepth); quickSort(left + 1, hi, maxDepth); } - + + /** TimSort implementation. The only difference with the spec is that this + * impl reuses {@link SorterTemplate#merge(int, int, int, int, int)} to + * merge runs (in place) instead of the original merging routine from + * TimSort (which requires extra memory but might be slightly faster). */ + private class TimSort { + + final int hi; + final int minRun; + final int[] runEnds; + int stackSize; + + TimSort(int lo, int hi) { + assert hi > lo; + // +1 because the first slot is reserved and always lo + runEnds = new int[TIMSORT_STACKSIZE + 1]; + runEnds[0] = lo; + stackSize = 0; + this.hi = hi; + minRun = minRun(hi - lo + 1); + } + + /** Minimum run length for an array of length length. */ + int minRun(int length) { + assert length >= TIMSORT_MINRUN; + int n = length; + int r = 0; + while (n >= 64) { + r |= n & 1; + n >>>= 1; + } + final int minRun = n + r; + assert minRun >= TIMSORT_MINRUN && minRun <= 64; + return minRun; + } + + int runLen(int i) { + final int off = stackSize - i; + return runEnds[off] - runEnds[off - 1]; + } + + int runBase(int i) { + return runEnds[stackSize - i - 1]; + } + + int runEnd(int i) { + return runEnds[stackSize - i]; + } + + void setRunEnd(int i, int runEnd) { + runEnds[stackSize - i] = runEnd; + } + + void pushRunLen(int len) { + runEnds[stackSize + 1] = runEnds[stackSize] + len; + ++stackSize; + } + + /** Merge run i with run i+1 */ + void mergeAt(int i) { + assert stackSize > i + 1; + final int l = runBase(i+1); + final int pivot = runBase(i); + final int h = runEnd(i); + merge(l, pivot, h, pivot - l, h - pivot); + for (int j = 1; j <= i+1; ++j) { + setRunEnd(j, runEnd(j-1)); + } + --stackSize; + } + + /** Compute the length of the next run, make the run sorted and return its + * length. */ + int nextRun() { + final int runBase = runEnd(0); + if (runBase == hi) { + return 1; + } + int l = 1; // length of the run + if (compare(runBase, runBase+1) > 0) { + // run must be strictly descending + while (runBase + l <= hi && compare(runBase + l - 1, runBase + l) > 0) { + ++l; + } + if (l < minRun && runBase + l <= hi) { + l = Math.min(hi - runBase + 1, minRun); + binarySort(runBase, runBase + l - 1); + } else { + // revert + for (int i = 0, halfL = l >>> 1; i < halfL; ++i) { + swap(runBase + i, runBase + l - i - 1); + } + } + } else { + // run must be non-descending + while (runBase + l <= hi && compare(runBase + l - 1, runBase + l) <= 0) { + ++l; + } + if (l < minRun && runBase + l <= hi) { + l = Math.min(hi - runBase + 1, minRun); + binarySort(runBase, runBase + l - 1); + } // else nothing to do, the run is already sorted + } + return l; + } + + void ensureInvariants() { + while (stackSize > 1) { + final int runLen0 = runLen(0); + final int runLen1 = runLen(1); + + if (stackSize > 2) { + final int runLen2 = runLen(2); + + if (runLen2 <= runLen1 + runLen0) { + // merge the smaller of 0 and 2 with 1 + if (runLen2 < runLen0) { + mergeAt(1); + } else { + mergeAt(0); + } + continue; + } + } + + if (runLen1 <= runLen0) { + mergeAt(0); + continue; + } + + break; + } + } + + void exhaustStack() { + while (stackSize > 1) { + mergeAt(0); + } + } + + void sort() { + do { + ensureInvariants(); + + // Push a new run onto the stack + pushRunLen(nextRun()); + + } while (runEnd(0) <= hi); + + exhaustStack(); + assert runEnd(0) == hi + 1; + } + + } + + /** Sorts using TimSort, see http://svn.python.org/projects/python/trunk/Objects/listsort.txt + * and http://svn.python.org/projects/python/trunk/Objects/listobject.c. + * TimSort is a stable sorting algorithm based on MergeSort but known to + * perform extremely well on partially-sorted inputs. + * For small collections, falls back to {@link #binarySort(int, int)}. */ + public final void timSort(int lo, int hi) { + if (hi - lo <= TIMSORT_THRESHOLD) { + binarySort(lo, hi); + return; + } + + new TimSort(lo, hi).sort(); + } + /** Sorts via stable in-place MergeSort algorithm * For small collections falls back to {@link #insertionSort(int,int)}. */ public final void mergeSort(int lo, int hi) { diff --git a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java index 61ed951ccc2..6b9d0a476be 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestArrayUtil.java @@ -187,7 +187,27 @@ public class TestArrayUtil extends LuceneTestCase { assertArrayEquals(a2, a1); } } - + + public void testTimSort() { + int num = atLeast(65); + for (int i = 0; i < num; i++) { + Integer[] a1 = createRandomArray(1000), a2 = a1.clone(); + ArrayUtil.timSort(a1); + Arrays.sort(a2); + assertArrayEquals(a2, a1); + + a1 = createRandomArray(1000); + a2 = a1.clone(); + ArrayUtil.timSort(a1, Collections.reverseOrder()); + Arrays.sort(a2, Collections.reverseOrder()); + assertArrayEquals(a2, a1); + // reverse back, so we can test that completely backwards sorted array (worst case) is working: + ArrayUtil.timSort(a1); + Arrays.sort(a2); + assertArrayEquals(a2, a1); + } + } + public void testInsertionSort() { for (int i = 0, c = atLeast(500); i < c; i++) { Integer[] a1 = createRandomArray(30), a2 = a1.clone(); @@ -207,6 +227,25 @@ public class TestArrayUtil extends LuceneTestCase { } } + public void testBinarySort() { + for (int i = 0, c = atLeast(500); i < c; i++) { + Integer[] a1 = createRandomArray(30), a2 = a1.clone(); + ArrayUtil.binarySort(a1); + Arrays.sort(a2); + assertArrayEquals(a2, a1); + + a1 = createRandomArray(30); + a2 = a1.clone(); + ArrayUtil.binarySort(a1, Collections.reverseOrder()); + Arrays.sort(a2, Collections.reverseOrder()); + assertArrayEquals(a2, a1); + // reverse back, so we can test that completely backwards sorted array (worst case) is working: + ArrayUtil.binarySort(a1); + Arrays.sort(a2); + assertArrayEquals(a2, a1); + } + } + static class Item implements Comparable { final int val, order; @@ -254,16 +293,49 @@ public class TestArrayUtil extends LuceneTestCase { last = act; } } - + + public void testTimSortStability() { + final Random rnd = random(); + Item[] items = new Item[100]; + for (int i = 0; i < items.length; i++) { + // half of the items have value but same order. The value of this items is sorted, + // so they should always be in order after sorting. + // The other half has defined order, but no (-1) value (they should appear after + // all above, when sorted). + final boolean equal = rnd.nextBoolean(); + items[i] = new Item(equal ? (i+1) : -1, equal ? 0 : (rnd.nextInt(1000)+1)); + } + + if (VERBOSE) System.out.println("Before: " + Arrays.toString(items)); + // if you replace this with ArrayUtil.quickSort(), test should fail: + ArrayUtil.timSort(items); + if (VERBOSE) System.out.println("Sorted: " + Arrays.toString(items)); + + Item last = items[0]; + for (int i = 1; i < items.length; i++) { + final Item act = items[i]; + if (act.order == 0) { + // order of "equal" items should be not mixed up + assertTrue(act.val > last.val); + } + assertTrue(act.order >= last.order); + last = act; + } + } + // should produce no exceptions public void testEmptyArraySort() { Integer[] a = new Integer[0]; ArrayUtil.quickSort(a); ArrayUtil.mergeSort(a); ArrayUtil.insertionSort(a); + ArrayUtil.binarySort(a); + ArrayUtil.timSort(a); ArrayUtil.quickSort(a, Collections.reverseOrder()); ArrayUtil.mergeSort(a, Collections.reverseOrder()); + ArrayUtil.timSort(a, Collections.reverseOrder()); ArrayUtil.insertionSort(a, Collections.reverseOrder()); + ArrayUtil.binarySort(a, Collections.reverseOrder()); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestCollectionUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestCollectionUtil.java index 819ffe8d44e..a3cdc3bebd2 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestCollectionUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestCollectionUtil.java @@ -72,7 +72,26 @@ public class TestCollectionUtil extends LuceneTestCase { assertEquals(list2, list1); } } - + + public void testTimSort() { + for (int i = 0, c = atLeast(500); i < c; i++) { + List list1 = createRandomList(1000), list2 = new ArrayList(list1); + CollectionUtil.timSort(list1); + Collections.sort(list2); + assertEquals(list2, list1); + + list1 = createRandomList(1000); + list2 = new ArrayList(list1); + CollectionUtil.timSort(list1, Collections.reverseOrder()); + Collections.sort(list2, Collections.reverseOrder()); + assertEquals(list2, list1); + // reverse back, so we can test that completely backwards sorted array (worst case) is working: + CollectionUtil.timSort(list1); + Collections.sort(list2); + assertEquals(list2, list1); + } + } + public void testInsertionSort() { for (int i = 0, c = atLeast(500); i < c; i++) { List list1 = createRandomList(30), list2 = new ArrayList(list1); @@ -91,25 +110,52 @@ public class TestCollectionUtil extends LuceneTestCase { assertEquals(list2, list1); } } - + + public void testBinarySort() { + for (int i = 0, c = atLeast(500); i < c; i++) { + List list1 = createRandomList(30), list2 = new ArrayList(list1); + CollectionUtil.binarySort(list1); + Collections.sort(list2); + assertEquals(list2, list1); + + list1 = createRandomList(30); + list2 = new ArrayList(list1); + CollectionUtil.binarySort(list1, Collections.reverseOrder()); + Collections.sort(list2, Collections.reverseOrder()); + assertEquals(list2, list1); + // reverse back, so we can test that completely backwards sorted array (worst case) is working: + CollectionUtil.binarySort(list1); + Collections.sort(list2); + assertEquals(list2, list1); + } + } + public void testEmptyListSort() { // should produce no exceptions List list = Arrays.asList(new Integer[0]); // LUCENE-2989 CollectionUtil.quickSort(list); CollectionUtil.mergeSort(list); + CollectionUtil.timSort(list); CollectionUtil.insertionSort(list); + CollectionUtil.binarySort(list); CollectionUtil.quickSort(list, Collections.reverseOrder()); CollectionUtil.mergeSort(list, Collections.reverseOrder()); + CollectionUtil.timSort(list, Collections.reverseOrder()); CollectionUtil.insertionSort(list, Collections.reverseOrder()); + CollectionUtil.binarySort(list, Collections.reverseOrder()); // check that empty non-random access lists pass sorting without ex (as sorting is not needed) list = new LinkedList(); CollectionUtil.quickSort(list); CollectionUtil.mergeSort(list); + CollectionUtil.timSort(list); CollectionUtil.insertionSort(list); + CollectionUtil.binarySort(list); CollectionUtil.quickSort(list, Collections.reverseOrder()); CollectionUtil.mergeSort(list, Collections.reverseOrder()); + CollectionUtil.timSort(list, Collections.reverseOrder()); CollectionUtil.insertionSort(list, Collections.reverseOrder()); + CollectionUtil.binarySort(list, Collections.reverseOrder()); } public void testOneElementListSort() { @@ -118,10 +164,14 @@ public class TestCollectionUtil extends LuceneTestCase { list.add(1); CollectionUtil.quickSort(list); CollectionUtil.mergeSort(list); + CollectionUtil.timSort(list); CollectionUtil.insertionSort(list); + CollectionUtil.binarySort(list); CollectionUtil.quickSort(list, Collections.reverseOrder()); CollectionUtil.mergeSort(list, Collections.reverseOrder()); + CollectionUtil.timSort(list, Collections.reverseOrder()); CollectionUtil.insertionSort(list, Collections.reverseOrder()); + CollectionUtil.binarySort(list, Collections.reverseOrder()); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestSorterTemplate.java b/lucene/core/src/test/org/apache/lucene/util/TestSorterTemplate.java new file mode 100644 index 00000000000..f43ffa46428 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestSorterTemplate.java @@ -0,0 +1,167 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +public class TestSorterTemplate extends LuceneTestCase { + + private static final int SLOW_SORT_THRESHOLD = 1000; + + // A sorter template that compares only the last 32 bits + static class Last32BitsSorterTemplate extends SorterTemplate { + + final long[] arr; + long pivot; + + Last32BitsSorterTemplate(long[] arr) { + this.arr = arr; + } + + @Override + protected void swap(int i, int j) { + final long tmp = arr[i]; + arr[i] = arr[j]; + arr[j] = tmp; + } + + private int compareValues(long i, long j) { + // only compare the last 32 bits + final long a = i & 0xFFFFFFFFL; + final long b = j & 0xFFFFFFFFL; + return a < b ? -1 : a == b ? 0 : 1; + } + + @Override + protected int compare(int i, int j) { + return compareValues(arr[i], arr[j]); + } + + @Override + protected void setPivot(int i) { + pivot = arr[i]; + } + + @Override + protected int comparePivot(int j) { + return compareValues(pivot, arr[j]); + } + + } + + void testSort(int[] intArr) { + // we modify the array as a long[] and store the original ord in the first 32 bits + // to be able to check stability + final long[] arr = toLongsAndOrds(intArr); + + // use MergeSort as a reference + // assertArrayEquals checks for sorting + stability + // assertArrayEquals(toInts) checks for sorting only + final long[] mergeSorted = Arrays.copyOf(arr, arr.length); + new Last32BitsSorterTemplate(mergeSorted).mergeSort(0, arr.length - 1); + + if (arr.length < SLOW_SORT_THRESHOLD) { + final long[] insertionSorted = Arrays.copyOf(arr, arr.length); + new Last32BitsSorterTemplate(insertionSorted).insertionSort(0, arr.length - 1); + assertArrayEquals(mergeSorted, insertionSorted); + + final long[] binarySorted = Arrays.copyOf(arr, arr.length); + new Last32BitsSorterTemplate(binarySorted).binarySort(0, arr.length - 1); + assertArrayEquals(mergeSorted, binarySorted); + } + + final long[] quickSorted = Arrays.copyOf(arr, arr.length); + new Last32BitsSorterTemplate(quickSorted).quickSort(0, arr.length - 1); + assertArrayEquals(toInts(mergeSorted), toInts(quickSorted)); + + final long[] timSorted = Arrays.copyOf(arr, arr.length); + new Last32BitsSorterTemplate(timSorted).timSort(0, arr.length - 1); + assertArrayEquals(mergeSorted, timSorted); + } + + private int[] toInts(long[] longArr) { + int[] arr = new int[longArr.length]; + for (int i = 0; i < longArr.length; ++i) { + arr[i] = (int) longArr[i]; + } + return arr; + } + + private long[] toLongsAndOrds(int[] intArr) { + final long[] arr = new long[intArr.length]; + for (int i = 0; i < intArr.length; ++i) { + arr[i] = (((long) i) << 32) | (intArr[i] & 0xFFFFFFFFL); + } + return arr; + } + + int randomLength() { + return random().nextBoolean() + ? random().nextInt(SLOW_SORT_THRESHOLD) + : random().nextInt(100000); + } + + public void testAscending() { + final int length = randomLength(); + final int[] arr = new int[length]; + arr[0] = random().nextInt(10); + for (int i = 1; i < arr.length; ++i) { + arr[i] = arr[i-1] + _TestUtil.nextInt(random(), 0, 10); + } + testSort(arr); + } + + public void testDescending() { + final int length = randomLength(); + final int[] arr = new int[length]; + arr[0] = random().nextInt(10); + for (int i = 1; i < arr.length; ++i) { + arr[i] = arr[i-1] - _TestUtil.nextInt(random(), 0, 10); + } + testSort(arr); + } + + public void testStrictlyDescending() { + final int length = randomLength(); + final int[] arr = new int[length]; + arr[0] = random().nextInt(10); + for (int i = 1; i < arr.length; ++i) { + arr[i] = arr[i-1] - _TestUtil.nextInt(random(), 1, 10); + } + testSort(arr); + } + + public void testRandom1() { + final int length = randomLength(); + final int[] arr = new int[length]; + for (int i = 1; i < arr.length; ++i) { + arr[i] = random().nextInt(); + } + testSort(arr); + } + + public void testRandom2() { + final int length = randomLength(); + final int[] arr = new int[length]; + for (int i = 1; i < arr.length; ++i) { + arr[i] = random().nextInt(10); + } + testSort(arr); + } + +} diff --git a/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java b/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java index dac24dc2a3f..d357439cd4e 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java +++ b/lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java @@ -147,8 +147,9 @@ public abstract class Sorter { } SorterTemplate sorter = new DocValueSorterTemplate(docs, comparator); - // TODO: use a stable sort instead? - sorter.quickSort(0, docs.length - 1); // docs is now the newToOld mapping + // It can be common to sort a reader, add docs, sort it again, ... and in + // that case timSort can save a lot of time + sorter.timSort(0, docs.length - 1); // docs is now the newToOld mapping // The reason why we use MonotonicAppendingLongBuffer here is that it // wastes very little memory if the index is in random order but can save diff --git a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java index f9c7a3af775..e850076df1e 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java +++ b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java @@ -307,9 +307,11 @@ public class SortingAtomicReader extends FilterAtomicReader { docs[i] = docs[j]; docs[j] = tmpDoc; - int tmpFreq = freqs[i]; - freqs[i] = freqs[j]; - freqs[j] = tmpFreq; + if (freqs != null) { + int tmpFreq = freqs[i]; + freqs[i] = freqs[j]; + freqs[j] = tmpFreq; + } } } @@ -335,8 +337,6 @@ public class SortingAtomicReader extends FilterAtomicReader { freqs[i] = in.freq(); ++i; } - SorterTemplate sorter = new DocFreqSorterTemplate(docs, freqs); - sorter.quickSort(0, i - 1); } else { freqs = null; while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){ @@ -345,8 +345,10 @@ public class SortingAtomicReader extends FilterAtomicReader { } docs[i++] = docMap.oldToNew(doc); } - Arrays.sort(docs, 0, i); } + // TimSort can save much time compared to other sorts in case of + // reverse sorting, or when sorting a concatenation of sorted readers + new DocFreqSorterTemplate(docs, freqs).timSort(0, i - 1); upto = i; } @@ -451,12 +453,9 @@ public class SortingAtomicReader extends FilterAtomicReader { int i = 0; while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.length) { - docs = ArrayUtil.grow(docs, docs.length + 1); - // don't grow() offsets since growing pattern for long and int is not the same. - // since we want docs and offsets at the same length, just grow it manually. - long[] tmp = new long[docs.length]; - System.arraycopy(offsets, 0, tmp, 0, offsets.length); - offsets = tmp; + final int newLength = ArrayUtil.oversize(i + 1, 4); + docs = Arrays.copyOf(docs, newLength); + offsets = Arrays.copyOf(offsets, newLength); } docs[i] = docMap.oldToNew(doc); offsets[i] = out.getFilePointer(); @@ -464,8 +463,7 @@ public class SortingAtomicReader extends FilterAtomicReader { i++; } upto = i; - SorterTemplate sorter = new DocOffsetSorterTemplate(docs, offsets); - sorter.quickSort(0, upto - 1); + new DocOffsetSorterTemplate(docs, offsets).timSort(0, upto - 1); out.close(); this.postingInput = new RAMInputStream("", file); }