mirror of https://github.com/apache/lucene.git
LUCENE-7299: Speed up BytesRefHash.sort.
This commit is contained in:
parent
f85dc0fef2
commit
be597a81ef
|
@ -86,6 +86,8 @@ Optimizations
|
|||
(which is used by TermsQuery, multi-term queries and several point queries).
|
||||
(Adrien Grand, Jeff Wartes, David Smiley)
|
||||
|
||||
* LUCENE-7299: Speed up BytesRefHash.sort(). (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-7127: Fix corner case bugs in GeoPointDistanceQuery. (Robert Muir)
|
||||
|
|
|
@ -158,40 +158,23 @@ public final class BytesRefHash {
|
|||
*/
|
||||
public int[] sort() {
|
||||
final int[] compact = compact();
|
||||
new IntroSorter() {
|
||||
new StringMSBRadixSorter() {
|
||||
|
||||
BytesRef scratch = new BytesRef();
|
||||
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
final int o = compact[i];
|
||||
int tmp = compact[i];
|
||||
compact[i] = compact[j];
|
||||
compact[j] = o;
|
||||
compact[j] = tmp;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
final int id1 = compact[i], id2 = compact[j];
|
||||
assert bytesStart.length > id1 && bytesStart.length > id2;
|
||||
pool.setBytesRef(scratch1, bytesStart[id1]);
|
||||
pool.setBytesRef(scratch2, bytesStart[id2]);
|
||||
return scratch1.compareTo(scratch2);
|
||||
protected BytesRef get(int i) {
|
||||
pool.setBytesRef(scratch, bytesStart[compact[i]]);
|
||||
return scratch;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setPivot(int i) {
|
||||
final int id = compact[i];
|
||||
assert bytesStart.length > id;
|
||||
pool.setBytesRef(pivot, bytesStart[id]);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int comparePivot(int j) {
|
||||
final int id = compact[j];
|
||||
assert bytesStart.length > id;
|
||||
pool.setBytesRef(scratch2, bytesStart[id]);
|
||||
return pivot.compareTo(scratch2);
|
||||
}
|
||||
|
||||
private final BytesRef pivot = new BytesRef(),
|
||||
scratch1 = new BytesRef(), scratch2 = new BytesRef();
|
||||
}.sort(0, count);
|
||||
return compact;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/** Radix sorter for variable-length strings. This class sorts based on the most
|
||||
* significant byte first and falls back to {@link IntroSorter} when the size
|
||||
* of the buckets to sort becomes small. It is <b>NOT</b> stable.
|
||||
* Worst-case memory usage is about {@code 2.3 KB} */
|
||||
abstract class StringMSBRadixSorter extends Sorter {
|
||||
|
||||
// after that many levels of recursion we fall back to introsort anyway
|
||||
private static final int LEVEL_THRESHOLD = 8;
|
||||
// size of histograms: 256 + 1 to indicate that the string is finished
|
||||
private static final int HISTOGRAM_SIZE = 257;
|
||||
// buckets below this size will be sorted with introsort
|
||||
private static final int LENGTH_THRESHOLD = 100;
|
||||
|
||||
// we store one histogram per recursion level
|
||||
private final int[][] histograms = new int[LEVEL_THRESHOLD][];
|
||||
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
|
||||
|
||||
/** Get a {@link BytesRef} for the given index. */
|
||||
protected abstract BytesRef get(int i);
|
||||
|
||||
/** Store bytes for the given index into {@code dest}, without the first k bytes. */
|
||||
private void get(int i, int k, BytesRef dest) {
|
||||
BytesRef ref = get(i);
|
||||
assert ref.length >= k;
|
||||
dest.bytes = ref.bytes;
|
||||
dest.offset = ref.offset + k;
|
||||
dest.length = ref.length - k;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final int compare(int i, int j) {
|
||||
throw new UnsupportedOperationException("unused: not a comparison-based sort");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void sort(int from, int to) {
|
||||
checkRange(from, to);
|
||||
sort(from, to, 0);
|
||||
}
|
||||
|
||||
private void sort(int from, int to, int k) {
|
||||
if (to - from <= LENGTH_THRESHOLD || k >= LEVEL_THRESHOLD) {
|
||||
introSort(from, to, k);
|
||||
} else {
|
||||
radixSort(from, to, k);
|
||||
}
|
||||
}
|
||||
|
||||
private void introSort(int from, int to, int k) {
|
||||
new IntroSorter() {
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
StringMSBRadixSorter.this.swap(i, j);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
get(i, k, scratch1);
|
||||
get(j, k, scratch2);
|
||||
return scratch1.compareTo(scratch2);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setPivot(int i) {
|
||||
get(i, k, pivot);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int comparePivot(int j) {
|
||||
get(j, k, scratch2);
|
||||
return pivot.compareTo(scratch2);
|
||||
}
|
||||
|
||||
private final BytesRef pivot = new BytesRef(),
|
||||
scratch1 = new BytesRef(), scratch2 = new BytesRef();
|
||||
}.sort(from, to);
|
||||
}
|
||||
|
||||
private void radixSort(int from, int to, int k) {
|
||||
int[] histogram = histograms[k];
|
||||
if (histogram == null) {
|
||||
histogram = histograms[k] = new int[HISTOGRAM_SIZE];
|
||||
} else {
|
||||
Arrays.fill(histogram, 0);
|
||||
}
|
||||
|
||||
buildHistogram(from, to, k, histogram);
|
||||
|
||||
// short-circuit: if all keys have the same byte at offset k, then recurse directly
|
||||
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
|
||||
if (histogram[i] == to - from) {
|
||||
// everything is in the same bucket, recurse
|
||||
if (i > 0) {
|
||||
sort(from, to, k + 1);
|
||||
}
|
||||
return;
|
||||
} else if (histogram[i] != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int[] startOffsets = histogram;
|
||||
int[] endOffsets = this.endOffsets;
|
||||
sumHistogram(histogram, endOffsets);
|
||||
reorder(from, to, startOffsets, endOffsets, k);
|
||||
endOffsets = startOffsets;
|
||||
|
||||
// recurse on all but the first bucket since all keys are equals in this
|
||||
// bucket (we already compared all bytes)
|
||||
for (int prev = endOffsets[0], i = 1; i < HISTOGRAM_SIZE; ++i) {
|
||||
int h = endOffsets[i];
|
||||
final int bucketLen = h - prev;
|
||||
if (bucketLen > 1) {
|
||||
sort(from + prev, from + h, k + 1);
|
||||
}
|
||||
prev = h;
|
||||
}
|
||||
}
|
||||
|
||||
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
|
||||
private int getBucket(int id, int k) {
|
||||
BytesRef ref = get(id);
|
||||
if (ref.length <= k) {
|
||||
return 0;
|
||||
}
|
||||
final int b = ref.bytes[ref.offset + k] & 0xff;
|
||||
return b + 1;
|
||||
}
|
||||
|
||||
/** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
|
||||
private int[] buildHistogram(int from, int to, int k, int[] histogram) {
|
||||
for (int i = from; i < to; ++i) {
|
||||
histogram[getBucket(i, k)]++;
|
||||
}
|
||||
return histogram;
|
||||
}
|
||||
|
||||
/** Accumulate values of the histogram so that it does not store counts but
|
||||
* start offsets. {@code endOffsets} will store the end offsets. */
|
||||
private static void sumHistogram(int[] histogram, int[] endOffsets) {
|
||||
int accum = 0;
|
||||
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
|
||||
final int count = histogram[i];
|
||||
histogram[i] = accum;
|
||||
accum += count;
|
||||
endOffsets[i] = accum;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reorder based on start/end offsets for each bucket. When this method
|
||||
* returns, startOffsets and endOffsets are equal.
|
||||
* @param startOffsets start offsets per bucket
|
||||
* @param endOffsets end offsets per bucket
|
||||
*/
|
||||
private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
|
||||
// reorder in place, like the dutch flag problem
|
||||
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
|
||||
final int limit = endOffsets[i];
|
||||
for (int h1 = startOffsets[i]; h1 < limit; h1 = startOffsets[i]) {
|
||||
final int b = getBucket(from + h1, k);
|
||||
final int h2 = startOffsets[b]++;
|
||||
swap(from + h1, from + h2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestStringMSBRadixSorter extends LuceneTestCase {
|
||||
|
||||
private void test(BytesRef[] refs, int len) {
|
||||
BytesRef[] expected = Arrays.copyOf(refs, len);
|
||||
Arrays.sort(expected);
|
||||
|
||||
new StringMSBRadixSorter() {
|
||||
|
||||
@Override
|
||||
protected BytesRef get(int i) {
|
||||
return refs[i];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
BytesRef tmp = refs[i];
|
||||
refs[i] = refs[j];
|
||||
refs[j] = tmp;
|
||||
}
|
||||
}.sort(0, len);
|
||||
BytesRef[] actual = Arrays.copyOf(refs, len);
|
||||
assertArrayEquals(expected, actual);
|
||||
}
|
||||
|
||||
public void testEmpty() {
|
||||
test(new BytesRef[random().nextInt(5)], 0);
|
||||
}
|
||||
|
||||
public void testOneValue() {
|
||||
BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
|
||||
test(new BytesRef[] { bytes }, 1);
|
||||
}
|
||||
|
||||
public void testTwoValues() {
|
||||
BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
|
||||
BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
|
||||
test(new BytesRef[] { bytes1, bytes2 }, 2);
|
||||
}
|
||||
|
||||
private void testRandom(int commonPrefixLen, int maxLen) {
|
||||
byte[] commonPrefix = new byte[commonPrefixLen];
|
||||
random().nextBytes(commonPrefix);
|
||||
final int len = random().nextInt(100000);
|
||||
BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
|
||||
for (int i = 0; i < len; ++i) {
|
||||
byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
|
||||
random().nextBytes(b);
|
||||
System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
|
||||
bytes[i] = new BytesRef(b);
|
||||
}
|
||||
test(bytes, len);
|
||||
}
|
||||
|
||||
public void testRandom() {
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
testRandom(0, 10);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomWithLotsOfDuplicates() {
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
testRandom(0, 2);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomWithSharedPrefix() {
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
testRandom(TestUtil.nextInt(random(), 1, 30), 10);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
testRandom(TestUtil.nextInt(random(), 1, 30), 2);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue