LUCENE-7399: Speed up flush of points.

This commit is contained in:
Adrien Grand 2016-07-29 17:32:29 +02:00
parent 234ea3ef89
commit 9fc4624853
16 changed files with 368 additions and 133 deletions

View File

@ -156,7 +156,8 @@ Optimizations
* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
(Adrien Grand)
* LUCENE-7396: Faster flush of points. (Adrien Grand, Mike McCandless)
* LUCENE-7396, LUCENE-7399: Faster flush of points.
(Adrien Grand, Mike McCandless)
Other

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs;
import org.apache.lucene.util.BytesRef;
/** {@link PointsReader} whose order of points can be changed.
* This class is useful for codecs to optimize flush.
* @lucene.internal */
@ -24,8 +26,8 @@ public abstract class MutablePointsReader extends PointsReader {
/** Sole constructor. */
protected MutablePointsReader() {}
/** Fill {@code packedValue} with the packed bytes of the i-th value. */
public abstract void getValue(int i, byte[] packedValue);
/** Set {@code packedValue} with a reference to the packed bytes of the i-th value. */
public abstract void getValue(int i, BytesRef packedValue);
/** Get the k-th byte of the i-th value. */
public abstract byte getByteAt(int i, int k);

View File

@ -51,9 +51,10 @@ class PointValuesWriter {
if (value == null) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": point value must not be null");
}
if (value.length != fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()) {
if (value.length != packedBytesLength) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": this field's value has length=" + value.length + " but should be " + (fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()));
}
if (docIDs.length == numPoints) {
docIDs = ArrayUtil.grow(docIDs, numPoints+1);
iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
@ -64,6 +65,7 @@ class PointValuesWriter {
numDocs++;
lastDocID = docID;
}
numPoints++;
}
@ -82,9 +84,12 @@ class PointValuesWriter {
if (fieldName.equals(fieldInfo.name) == false) {
throw new IllegalArgumentException("fieldName must be the same");
}
final BytesRef scratch = new BytesRef();
final byte[] packedValue = new byte[packedBytesLength];
for(int i=0;i<numPoints;i++) {
getValue(i, packedValue);
getValue(i, scratch);
assert scratch.length == packedValue.length;
System.arraycopy(scratch.bytes, scratch.offset, packedValue, 0, packedBytesLength);
visitor.visit(getDocID(i), packedValue);
}
}
@ -152,9 +157,10 @@ class PointValuesWriter {
}
@Override
public void getValue(int i, byte[] packedValue) {
public void getValue(int i, BytesRef packedValue) {
final long offset = (long) packedBytesLength * ords[i];
bytes.readBytes(offset, packedValue, 0, packedBytesLength);
packedValue.length = packedBytesLength;
bytes.setRawBytesRef(packedValue, offset);
}
@Override

View File

@ -379,6 +379,27 @@ public final class ByteBlockPool {
} while (true);
}
/**
* Set the given {@link BytesRef} so that its content is equal to the
* {@code ref.length} bytes starting at {@code offset}. Most of the time this
* method will set pointers to internal data-structures. However, in case a
* value crosses a boundary, a fresh copy will be made.
* In contrast to {@link #setBytesRef(BytesRef, int)}, this method does not
* expect the length to be encoded with the data.
*/
public void setRawBytesRef(BytesRef ref, final long offset) {
// translate the absolute pool offset into a (buffer, position) pair
int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
int pos = (int) (offset & BYTE_BLOCK_MASK);
if (pos + ref.length <= BYTE_BLOCK_SIZE) {
// fast path: the value fits entirely inside one block, so point the ref
// directly at the pool's internal buffer without copying
ref.bytes = buffers[bufferIndex];
ref.offset = pos;
} else {
// slow path: the value crosses a block boundary, so copy it into a
// freshly allocated array
ref.bytes = new byte[ref.length];
ref.offset = 0;
readBytes(offset, ref.bytes, 0, ref.length);
}
}
/** Read a single byte at the given {@code offset}. */
public byte readByte(long offset) {
int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);

View File

@ -33,8 +33,8 @@ public abstract class InPlaceMergeSorter extends Sorter {
}
void mergeSort(int from, int to) {
if (to - from < INSERTION_SORT_THRESHOLD) {
insertionSort(from, to);
if (to - from < BINARY_SORT_THRESHOLD) {
binarySort(from, to);
} else {
final int mid = (from + to) >>> 1;
mergeSort(from, mid);

View File

@ -34,6 +34,8 @@ public abstract class IntroSelector extends Selector {
}
// heap sort
// TODO: use median of median instead to have linear worst-case rather than
// n*log(n)
void slowSelect(int from, int to, int k) {
new Sorter() {

View File

@ -37,8 +37,8 @@ public abstract class IntroSorter extends Sorter {
}
void quicksort(int from, int to, int maxDepth) {
if (to - from < INSERTION_SORT_THRESHOLD) {
insertionSort(from, to);
if (to - from < BINARY_SORT_THRESHOLD) {
binarySort(from, to);
return;
} else if (--maxDepth < 0) {
heapSort(from, to);
@ -83,12 +83,13 @@ public abstract class IntroSorter extends Sorter {
quicksort(left + 1, to, maxDepth);
}
/** Save the value at slot <code>i</code> so that it can later be used as a
* pivot, see {@link #comparePivot(int)}. */
// Don't rely on the slow default impl of setPivot/comparePivot since
// quicksort relies on these methods to be fast for good performance
@Override
protected abstract void setPivot(int i);
/** Compare the pivot with the slot at <code>j</code>, similarly to
* {@link #compare(int, int) compare(i, j)}. */
@Override
protected abstract int comparePivot(int j);
@Override

View File

@ -38,6 +38,7 @@ public abstract class MSBRadixSorter extends Sorter {
// we store one histogram per recursion level
private final int[][] histograms = new int[LEVEL_THRESHOLD][];
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
private final int[] commonPrefix;
private final int maxLength;
@ -47,6 +48,7 @@ public abstract class MSBRadixSorter extends Sorter {
*/
protected MSBRadixSorter(int maxLength) {
this.maxLength = maxLength;
this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@ -116,14 +118,14 @@ public abstract class MSBRadixSorter extends Sorter {
@Override
public void sort(int from, int to) {
checkRange(from, to);
sort(from, to, 0);
sort(from, to, 0, 0);
}
private void sort(int from, int to, int k) {
if (to - from <= LENGTH_THRESHOLD || k >= LEVEL_THRESHOLD) {
private void sort(int from, int to, int k, int l) {
if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
introSort(from, to, k);
} else {
radixSort(from, to, k);
radixSort(from, to, k, l);
}
}
@ -131,28 +133,30 @@ public abstract class MSBRadixSorter extends Sorter {
getFallbackSorter(k).sort(from, to);
}
private void radixSort(int from, int to, int k) {
int[] histogram = histograms[k];
/**
* @param k the character number to compare
* @param l the level of recursion
*/
private void radixSort(int from, int to, int k, int l) {
int[] histogram = histograms[l];
if (histogram == null) {
histogram = histograms[k] = new int[HISTOGRAM_SIZE];
histogram = histograms[l] = new int[HISTOGRAM_SIZE];
} else {
Arrays.fill(histogram, 0);
}
buildHistogram(from, to, k, histogram);
// short-circuit: if all keys have the same byte at offset k, then recurse directly
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
if (histogram[i] == to - from) {
// everything is in the same bucket, recurse
if (i > 0) {
sort(from, to, k + 1);
}
return;
} else if (histogram[i] != 0) {
break;
final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
if (commonPrefixLength > 0) {
// if there are no more chars to compare or if all entries fell into the
// first bucket (which means strings are shorter than k) then we are done
// otherwise recurse
if (k + commonPrefixLength < maxLength
&& histogram[0] < to - from) {
radixSort(from, to, k + commonPrefixLength, l);
}
return;
}
assert assertHistogram(commonPrefixLength, histogram);
int[] startOffsets = histogram;
int[] endOffsets = this.endOffsets;
@ -167,24 +171,83 @@ public abstract class MSBRadixSorter extends Sorter {
int h = endOffsets[i];
final int bucketLen = h - prev;
if (bucketLen > 1) {
sort(from + prev, from + h, k + 1);
sort(from + prev, from + h, k + 1, l + 1);
}
prev = h;
}
}
}
// only used from assert
// Sanity-checks the invariant between the computed common prefix and the
// histogram: a positive common prefix length implies that all visited values
// fell into a single bucket (one non-empty histogram entry), and vice versa.
private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
int numberOfUniqueBytes = 0;
for (int freq : histogram) {
if (freq > 0) {
numberOfUniqueBytes++;
}
}
if (numberOfUniqueBytes == 1) {
// all values share the same byte at this offset, so the common prefix
// must cover at least that byte
assert commonPrefixLength >= 1;
} else {
// values diverge at this offset, so no common prefix may be reported
assert commonPrefixLength == 0 : commonPrefixLength;
}
// always true so the method can be invoked inside an assert statement
return true;
}
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
/** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
private int[] buildHistogram(int from, int to, int k, int[] histogram) {
/** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
* and return a common prefix length for all visited values.
* @see #buildHistogram */
private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
final int[] commonPrefix = this.commonPrefix;
// the candidate prefix is bounded by both the scratch buffer capacity and
// the number of bytes left before maxLength
int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
// seed the candidate common prefix from the first value
for (int j = 0; j < commonPrefixLength; ++j) {
final int b = byteAt(from, k + j);
commonPrefix[j] = b;
if (b == -1) {
// end-of-value sentinel: the prefix cannot extend past it
commonPrefixLength = j + 1;
break;
}
}
int i;
// shrink the candidate prefix against every other value in the range
outer: for (i = from + 1; i < to; ++i) {
for (int j = 0; j < commonPrefixLength; ++j) {
final int b = byteAt(i, k + j);
if (b != commonPrefix[j]) {
commonPrefixLength = j;
if (commonPrefixLength == 0) { // we have no common prefix
// record what we already know: the first i - from values share
// commonPrefix[0] at offset k, and value i has byte b there
histogram[commonPrefix[0] + 1] = i - from;
histogram[b + 1] = 1;
break outer;
}
break;
}
}
}
if (i < to) {
// the loop got broken because there is no common prefix
assert commonPrefixLength == 0;
// count the remaining values with the plain histogram pass
buildHistogram(i + 1, to, k, histogram);
} else {
assert commonPrefixLength > 0;
// all values share their byte at offset k, so they all land in one bucket
histogram[commonPrefix[0] + 1] = to - from;
}
return commonPrefixLength;
}
/** Build a histogram of the k-th characters of values occurring between
* offsets {@code from} and {@code to}, using {@link #getBucket}. */
private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
return histogram;
}
/** Accumulate values of the histogram so that it does not store counts but

View File

@ -36,6 +36,7 @@ public abstract class RadixSelector extends Selector {
// we store one histogram per recursion level
private final int[] histogram = new int[HISTOGRAM_SIZE];
private final int[] commonPrefix;
private final int maxLength;
@ -45,6 +46,7 @@ public abstract class RadixSelector extends Selector {
*/
protected RadixSelector(int maxLength) {
this.maxLength = maxLength;
this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@ -112,22 +114,37 @@ public abstract class RadixSelector extends Selector {
@Override
public void select(int from, int to, int k) {
checkArgs(from, to, k);
select(from, to, k, 0);
select(from, to, k, 0, 0);
}
private void select(int from, int to, int k, int d) {
private void select(int from, int to, int k, int d, int l) {
if (to - from <= LENGTH_THRESHOLD || d >= LEVEL_THRESHOLD) {
getFallbackSelector(d).select(from, to, k);
} else {
radixSelect(from, to, k, d);
radixSelect(from, to, k, d, l);
}
}
private void radixSelect(int from, int to, int k, int d) {
/**
* @param d the character number to compare
* @param l the level of recursion
*/
private void radixSelect(int from, int to, int k, int d, int l) {
final int[] histogram = this.histogram;
Arrays.fill(histogram, 0);
buildHistogram(from, to, d, histogram);
final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, d, histogram);
if (commonPrefixLength > 0) {
// if there are no more chars to compare or if all entries fell into the
// first bucket (which means strings are shorter than d) then we are done
// otherwise recurse
if (d + commonPrefixLength < maxLength
&& histogram[0] < to - from) {
radixSelect(from, to, k, d + commonPrefixLength, l);
}
return;
}
assert assertHistogram(commonPrefixLength, histogram);
int bucketFrom = from;
for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) {
@ -138,7 +155,7 @@ public abstract class RadixSelector extends Selector {
if (bucket != 0 && d + 1 < maxLength) {
// all elements in bucket 0 are equal so we only need to recurse if bucket != 0
select(bucketFrom, bucketTo, k, d + 1);
select(bucketFrom, bucketTo, k, d + 1, l + 1);
}
return;
}
@ -147,17 +164,76 @@ public abstract class RadixSelector extends Selector {
throw new AssertionError("Unreachable code");
}
// only used from assert
// Sanity-checks the invariant between the computed common prefix and the
// histogram: a positive common prefix length implies that all visited values
// fell into a single bucket (one non-empty histogram entry), and vice versa.
private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
int numberOfUniqueBytes = 0;
for (int freq : histogram) {
if (freq > 0) {
numberOfUniqueBytes++;
}
}
if (numberOfUniqueBytes == 1) {
// all values share the same byte at this offset, so the common prefix
// must cover at least that byte
assert commonPrefixLength >= 1;
} else {
// values diverge at this offset, so no common prefix may be reported
assert commonPrefixLength == 0;
}
// always true so the method can be invoked inside an assert statement
return true;
}
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
/** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
private int[] buildHistogram(int from, int to, int k, int[] histogram) {
/** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
* and return a common prefix length for all visited values.
* @see #buildHistogram */
private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
final int[] commonPrefix = this.commonPrefix;
// the candidate prefix is bounded by both the scratch buffer capacity and
// the number of bytes left before maxLength
int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
// seed the candidate common prefix from the first value
for (int j = 0; j < commonPrefixLength; ++j) {
final int b = byteAt(from, k + j);
commonPrefix[j] = b;
if (b == -1) {
// end-of-value sentinel: the prefix cannot extend past it
commonPrefixLength = j + 1;
break;
}
}
int i;
// shrink the candidate prefix against every other value in the range
outer: for (i = from + 1; i < to; ++i) {
for (int j = 0; j < commonPrefixLength; ++j) {
final int b = byteAt(i, k + j);
if (b != commonPrefix[j]) {
commonPrefixLength = j;
if (commonPrefixLength == 0) { // we have no common prefix
// record what we already know: the first i - from values share
// commonPrefix[0] at offset k, and value i has byte b there
histogram[commonPrefix[0] + 1] = i - from;
histogram[b + 1] = 1;
break outer;
}
break;
}
}
}
if (i < to) {
// the loop got broken because there is no common prefix
assert commonPrefixLength == 0;
// count the remaining values with the plain histogram pass
buildHistogram(i + 1, to, k, histogram);
} else {
assert commonPrefixLength > 0;
// all values share their byte at offset k, so they all land in one bucket
histogram[commonPrefix[0] + 1] = to - from;
}
return commonPrefixLength;
}
/** Build a histogram of the k-th characters of values occurring between
* offsets {@code from} and {@code to}, using {@link #getBucket}. */
private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
return histogram;
}
/** Reorder elements so that all of them that fall into {@code bucket} are

View File

@ -23,7 +23,7 @@ import java.util.Comparator;
* @lucene.internal */
public abstract class Sorter {
static final int INSERTION_SORT_THRESHOLD = 20;
static final int BINARY_SORT_THRESHOLD = 20;
/** Sole constructor, used for inheritance. */
protected Sorter() {}
@ -36,6 +36,20 @@ public abstract class Sorter {
/** Swap values at slots <code>i</code> and <code>j</code>. */
protected abstract void swap(int i, int j);
private int pivotIndex;
/** Save the value at slot <code>i</code> so that it can later be used as a
* pivot, see {@link #comparePivot(int)}. */
protected void setPivot(int i) {
pivotIndex = i;
}
/** Compare the pivot with the slot at <code>j</code>, similarly to
* {@link #compare(int, int) compare(i, j)}. */
protected int comparePivot(int j) {
return compare(pivotIndex, j);
}
/** Sort the slice which starts at <code>from</code> (inclusive) and ends at
* <code>to</code> (exclusive). */
public abstract void sort(int from, int to);
@ -163,54 +177,41 @@ public abstract class Sorter {
}
}
void insertionSort(int from, int to) {
for (int i = from + 1; i < to; ++i) {
for (int j = i; j > from; --j) {
if (compare(j - 1, j) > 0) {
swap(j - 1, j);
} else {
break;
}
}
}
}
/**
* A binary sort implementation. This performs {@code O(n*log(n))} comparisons
* and {@code O(n^2)} swaps. It is typically used by more sophisticated
* implementations as a fall-back when the numbers of items to sort has become
* less than {@value #BINARY_SORT_THRESHOLD}.
*/
void binarySort(int from, int to) {
binarySort(from, to, from + 1);
}
void binarySort(int from, int to, int i) {
for ( ; i < to; ++i) {
setPivot(i);
int l = from;
int h = i - 1;
while (l <= h) {
final int mid = (l + h) >>> 1;
final int cmp = compare(i, mid);
final int cmp = comparePivot(mid);
if (cmp < 0) {
h = mid - 1;
} else {
l = mid + 1;
}
}
switch (i - l) {
case 2:
swap(l + 1, l + 2);
swap(l, l + 1);
break;
case 1:
swap(l, l + 1);
break;
case 0:
break;
default:
for (int j = i; j > l; --j) {
swap(j - 1, j);
}
break;
for (int j = i; j > l; --j) {
swap(j - 1, j);
}
}
}
/**
* Use heap sort to sort items between {@code from} inclusive and {@code to}
* exclusive. This runs in {@code O(n*log(n))} and is used as a fall-back by
* {@link IntroSorter}.
*/
void heapSort(int from, int to) {
if (to - from <= 1) {
return;

View File

@ -112,7 +112,8 @@ public class BKDWriter implements Closeable {
final byte[] scratchDiff;
final byte[] scratch1;
final byte[] scratch2;
final BytesRef scratchBytesRef = new BytesRef();
final BytesRef scratchBytesRef1 = new BytesRef();
final BytesRef scratchBytesRef2 = new BytesRef();
final int[] commonPrefixLengths;
protected final FixedBitSet docsSeen;
@ -174,7 +175,6 @@ public class BKDWriter implements Closeable {
packedBytesLength = numDims * bytesPerDim;
scratchDiff = new byte[bytesPerDim];
scratchBytesRef.length = packedBytesLength;
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
commonPrefixLengths = new int[numDims];
@ -465,14 +465,14 @@ public class BKDWriter implements Closeable {
Arrays.fill(minPackedValue, (byte) 0xff);
Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
reader.getValue(i, scratch1);
reader.getValue(i, scratchBytesRef1);
for(int dim=0;dim<numDims;dim++) {
int offset = dim*bytesPerDim;
if (StringHelper.compare(bytesPerDim, scratch1, offset, minPackedValue, offset) < 0) {
System.arraycopy(scratch1, offset, minPackedValue, offset, bytesPerDim);
if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset) < 0) {
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, bytesPerDim);
}
if (StringHelper.compare(bytesPerDim, scratch1, offset, maxPackedValue, offset) > 0) {
System.arraycopy(scratch1, offset, maxPackedValue, offset, bytesPerDim);
if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset) > 0) {
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim);
}
}
@ -811,6 +811,7 @@ public class BKDWriter implements Closeable {
}
private PointWriter sort(int dim) throws IOException {
assert dim >= 0 && dim < numDims;
if (heapPointWriter != null) {
@ -1251,13 +1252,13 @@ public class BKDWriter implements Closeable {
// Compute common prefixes
Arrays.fill(commonPrefixLengths, bytesPerDim);
reader.getValue(from, scratch1);
reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) {
reader.getValue(i, scratch2);
reader.getValue(i, scratchBytesRef2);
for (int dim=0;dim<numDims;dim++) {
final int offset = dim * bytesPerDim;
for(int j=0;j<commonPrefixLengths[dim];j++) {
if (scratch1[offset+j] != scratch2[offset+j]) {
if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
commonPrefixLengths[dim] = j;
break;
}
@ -1294,7 +1295,7 @@ public class BKDWriter implements Closeable {
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
reader, from, to, scratch1, scratch2);
reader, from, to, scratchBytesRef1, scratchBytesRef2);
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
@ -1307,22 +1308,16 @@ public class BKDWriter implements Closeable {
writeLeafBlockDocs(out, docIDs, 0, count);
// Write the common prefixes:
reader.getValue(from, scratch1);
reader.getValue(from, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
final BytesRef scratch = new BytesRef(packedBytesLength);
{
scratch.offset = 0;
scratch.length = packedBytesLength;
}
@Override
public BytesRef apply(int i) {
reader.getValue(from + i, scratch.bytes);
return scratch;
reader.getValue(from + i, scratchBytesRef1);
return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
@ -1344,18 +1339,20 @@ public class BKDWriter implements Closeable {
}
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratch1, scratch2);
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
// set the split value
final int address = nodeID * (1+bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
reader.getValue(mid, scratch1);
System.arraycopy(scratch1, splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
reader.getValue(mid, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength);
byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength);
System.arraycopy(scratch1, splitDim * bytesPerDim, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
System.arraycopy(scratch1, splitDim * bytesPerDim, maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
// recurse
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,

View File

@ -17,6 +17,7 @@
package org.apache.lucene.util.bkd;
import org.apache.lucene.codecs.MutablePointsReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.MSBRadixSorter;
@ -54,8 +55,8 @@ final class MutablePointsReaderUtils {
protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
return new IntroSorter() {
final byte[] pivot = new byte[packedBytesLength];
final byte[] scratch = new byte[packedBytesLength];
final BytesRef pivot = new BytesRef();
final BytesRef scratch = new BytesRef();
int pivotDoc;
@Override
@ -73,7 +74,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < packedBytesLength) {
reader.getValue(j, scratch);
int cmp = StringHelper.compare(packedBytesLength - k, pivot, k, scratch, k);
int cmp = StringHelper.compare(packedBytesLength - k, pivot.bytes, pivot.offset + k, scratch.bytes, scratch.offset + k);
if (cmp != 0) {
return cmp;
}
@ -89,7 +90,7 @@ final class MutablePointsReaderUtils {
/** Sort points on the given dimension. */
static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointsReader reader, int from, int to,
byte[] scratch1, byte[] scratch2) {
BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
@ -97,7 +98,7 @@ final class MutablePointsReaderUtils {
final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim];
new IntroSorter() {
final byte[] pivot = scratch1;
final BytesRef pivot = scratch1;
int pivotDoc = -1;
@Override
@ -114,7 +115,7 @@ final class MutablePointsReaderUtils {
@Override
protected int comparePivot(int j) {
reader.getValue(j, scratch2);
int cmp = StringHelper.compare(numBytesToCompare, pivot, offset, scratch2, offset);
int cmp = StringHelper.compare(numBytesToCompare, pivot.bytes, pivot.offset + offset, scratch2.bytes, scratch2.offset + offset);
if (cmp == 0) {
cmp = pivotDoc - reader.getDocID(j);
}
@ -128,7 +129,7 @@ final class MutablePointsReaderUtils {
* equal to it. */
static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointsReader reader, int from, int to, int mid,
byte[] scratch1, byte[] scratch2) {
BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
@ -138,7 +139,7 @@ final class MutablePointsReaderUtils {
protected Selector getFallbackSelector(int k) {
return new IntroSelector() {
final byte[] pivot = scratch1;
final BytesRef pivot = scratch1;
int pivotDoc;
@Override
@ -156,7 +157,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < cmpBytes) {
reader.getValue(j, scratch2);
int cmp = StringHelper.compare(cmpBytes - k, pivot, offset + k, scratch2, offset + k);
int cmp = StringHelper.compare(cmpBytes - k, pivot.bytes, pivot.offset + offset + k, scratch2.bytes, scratch2.offset + offset + k);
if (cmp != 0) {
return cmp;
}

View File

@ -45,12 +45,25 @@ public class TestByteBlockPool extends LuceneTestCase {
for (BytesRef expected : list) {
ref.grow(expected.length);
ref.setLength(expected.length);
if (random().nextBoolean()) {
pool.readBytes(position, ref.bytes(), 0, ref.length());
} else {
for (int i = 0; i < ref.length(); ++i) {
ref.setByteAt(i, pool.readByte(position + i));
}
switch (random().nextInt(3)) {
case 0:
// copy bytes
pool.readBytes(position, ref.bytes(), 0, ref.length());
break;
case 1:
// copy bytes one by one
for (int i = 0; i < ref.length(); ++i) {
ref.setByteAt(i, pool.readByte(position + i));
}
break;
case 2:
BytesRef scratch = new BytesRef();
scratch.length = ref.length();
pool.setRawBytesRef(scratch, position);
System.arraycopy(scratch.bytes, scratch.offset, ref.bytes(), 0, ref.length());
break;
default:
fail();
}
assertEquals(expected, ref.get());
position += ref.length();

View File

@ -41,9 +41,12 @@ public class TestMSBRadixSorter extends LuceneTestCase {
break;
}
final int finalMaxLength = maxLength;
new MSBRadixSorter(maxLength) {
@Override
protected int byteAt(int i, int k) {
assertTrue(k < finalMaxLength);
BytesRef ref = refs[i];
if (ref.length <= k) {
return -1;

View File

@ -36,13 +36,41 @@ public class TestRadixSelector extends LuceneTestCase {
random().nextBytes(bytes);
arr[i] = new BytesRef(bytes);
}
doTest(arr, from, to, maxLen);
}
public void testSharedPrefixes() {
for (int iter = 0; iter < 100; ++iter) {
doTestSharedPrefixes();
}
}
private void doTestSharedPrefixes() {
final int from = random().nextInt(5);
final int to = from + TestUtil.nextInt(random(), 1, 10000);
final int maxLen = TestUtil.nextInt(random(), 1, 12);
BytesRef[] arr = new BytesRef[from + to + random().nextInt(5)];
for (int i = 0; i < arr.length; ++i) {
byte[] bytes = new byte[TestUtil.nextInt(random(), 0, maxLen)];
random().nextBytes(bytes);
arr[i] = new BytesRef(bytes);
}
final int sharedPrefixLength = Math.min(arr[0].length, TestUtil.nextInt(random(), 1, maxLen));
for (int i = 1; i < arr.length; ++i) {
System.arraycopy(arr[0].bytes, arr[0].offset, arr[i].bytes, arr[i].offset, Math.min(sharedPrefixLength, arr[i].length));
}
doTest(arr, from, to, maxLen);
}
private void doTest(BytesRef[] arr, int from, int to, int maxLen) {
final int k = TestUtil.nextInt(random(), from, to - 1);
BytesRef[] expected = arr.clone();
Arrays.sort(expected, from, to);
BytesRef[] actual = arr.clone();
RadixSelector selector = new RadixSelector(random().nextBoolean() ? maxLen : Integer.MAX_VALUE) {
final int enforcedMaxLen = random().nextBoolean() ? maxLen : Integer.MAX_VALUE;
RadixSelector selector = new RadixSelector(enforcedMaxLen) {
@Override
protected void swap(int i, int j) {
@ -51,6 +79,7 @@ public class TestRadixSelector extends LuceneTestCase {
@Override
protected int byteAt(int i, int k) {
assertTrue(k < enforcedMaxLen);
BytesRef b = actual[i];
if (k >= b.length) {
return -1;

View File

@ -44,7 +44,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Arrays.sort(points, new Comparator<Point>() {
@Override
public int compare(Point o1, Point o2) {
int cmp = StringHelper.compare(bytesPerDim, o1.packedValue, 0, o2.packedValue, 0);
int cmp = o1.packedValue.compareTo(o2.packedValue);
if (cmp == 0) {
cmp = Integer.compare(o1.doc, o2.doc);
}
@ -70,19 +70,25 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
for (int i = 0; i < commonPrefixLengths.length; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = 0; dim < numDims; ++dim) {
int offset = dim * bytesPerDim;
System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLengths[dim]);
BytesRef packedValue = points[i].packedValue;
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(numDims);
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length,
new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
new BytesRef(), new BytesRef());
for (int i = 1; i < points.length; ++i) {
final int offset = sortedDim * bytesPerDim;
int cmp = StringHelper.compare(bytesPerDim, reader.points[i-1].packedValue, offset, reader.points[i].packedValue, offset);
BytesRef previousValue = reader.points[i-1].packedValue;
BytesRef currentValue = reader.points[i].packedValue;
int cmp = StringHelper.compare(bytesPerDim,
previousValue.bytes, previousValue.offset + offset,
currentValue.bytes, currentValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i - 1].doc - reader.points[i].doc;
}
@ -103,17 +109,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc);
int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim);
final int splitDim = random().nextInt(numDims);
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
BytesRef packedValue = points[i].packedValue;
int offset = splitDim * bytesPerDim;
System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLength);
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength);
}
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot,
new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
new BytesRef(), new BytesRef());
BytesRef pivotValue = reader.points[pivot].packedValue;
int offset = splitDim * bytesPerDim;
for (int i = 0; i < points.length; ++i) {
int cmp = StringHelper.compare(bytesPerDim, reader.points[i].packedValue, offset, reader.points[pivot].packedValue, offset);
BytesRef value = reader.points[i].packedValue;
int cmp = StringHelper.compare(bytesPerDim,
value.bytes, value.offset + offset,
pivotValue.bytes, pivotValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i].doc - reader.points[pivot].doc;
}
@ -140,11 +152,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private static class Point {
final byte[] packedValue;
final BytesRef packedValue;
final int doc;
Point(byte[] packedValue, int doc) {
this.packedValue = packedValue;
// use a non-null offset to make sure MutablePointsReaderUtils does not ignore it
this.packedValue = new BytesRef(packedValue.length + 1);
this.packedValue.bytes[0] = (byte) random().nextInt(256);
this.packedValue.offset = 1;
this.packedValue.length = packedValue.length;
this.doc = doc;
}
@ -154,17 +170,17 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
return false;
}
Point that = (Point) obj;
return Arrays.equals(packedValue, that.packedValue) && doc == that.doc;
return packedValue.equals(that.packedValue) && doc == that.doc;
}
@Override
public int hashCode() {
return 31 * Arrays.hashCode(packedValue) + doc;
return 31 * packedValue.hashCode() + doc;
}
@Override
public String toString() {
return "value=" + new BytesRef(packedValue) + " doc=" + doc;
return "value=" + packedValue + " doc=" + doc;
}
}
@ -187,13 +203,16 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
@Override
public void getValue(int i, byte[] packedValue) {
System.arraycopy(points[i].packedValue, 0, packedValue, 0, points[i].packedValue.length);
public void getValue(int i, BytesRef packedValue) {
packedValue.bytes = points[i].packedValue.bytes;
packedValue.offset = points[i].packedValue.offset;
packedValue.length = points[i].packedValue.length;
}
@Override
public byte getByteAt(int i, int k) {
return points[i].packedValue[k];
BytesRef packedValue = points[i].packedValue;
return packedValue.bytes[packedValue.offset + k];
}
@Override