LUCENE-8888: Improve distribution of points with data dimensions in BKD tree leaves (#747)

This commit is contained in:
Ignacio Vera 2019-07-04 10:50:23 +02:00 committed by GitHub
parent 8bf5e8be64
commit 5bf6cf2edd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 443 additions and 205 deletions

View File

@ -175,6 +175,9 @@ Optimizations
* LUCENE-8901: Load frequencies lazily only when needed in BlockDocsEnum and
BlockImpactsEverythingEnum (Mayya Sharipova).
* LUCENE-8888: Optimize distribution of points with data dimensions in
BKD tree leaves. (Ignacio Vera)
Test Framework
* LUCENE-8825: CheckHits now displays the shard index in case of mismatch

View File

@ -596,7 +596,7 @@ final class SimpleTextBKDWriter implements Closeable {
assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
//We re-use the selector so we do not need to create an object every time.
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
boolean success = false;
try {
@ -604,7 +604,7 @@ final class SimpleTextBKDWriter implements Closeable {
build(1, numLeaves, points, out,
radixSelector, minPackedValue, maxPackedValue,
splitPackedValues, leafBlockFPs);
splitPackedValues, leafBlockFPs, new int[maxPointsInLeafNode]);
// If no exception, we should have cleaned everything up:
@ -876,7 +876,7 @@ final class SimpleTextBKDWriter implements Closeable {
}
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths,
reader, from, to, scratchBytesRef1, scratchBytesRef2);
// Save the block file pointer:
@ -919,7 +919,7 @@ final class SimpleTextBKDWriter implements Closeable {
break;
}
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
// set the split value
@ -950,7 +950,8 @@ final class SimpleTextBKDWriter implements Closeable {
BKDRadixSelector radixSelector,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs) throws IOException {
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
@ -1009,7 +1010,12 @@ final class SimpleTextBKDWriter implements Closeable {
// loading the values:
int count = to - from;
assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
writeLeafBlockDocs(out, heapSource.docIDs, from, count);
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = 0; i < count; i++) {
docIDs[i] = heapSource.getPackedValueSlice(from + i).docID();
}
writeLeafBlockDocs(out, spareDocIds, 0, count);
// TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
// from the index, much like how terms dict does so from the FST:
@ -1029,7 +1035,7 @@ final class SimpleTextBKDWriter implements Closeable {
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
heapSource.docIDs, from);
docIDs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
} else {
@ -1074,12 +1080,12 @@ final class SimpleTextBKDWriter implements Closeable {
// Recurse on left tree:
build(2*nodeID, leafNodeOffset, pathSlices[0], out, radixSelector,
minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs);
minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
// TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right?
// Recurse on right tree:
build(2*nodeID+1, leafNodeOffset, pathSlices[1], out, radixSelector,
minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs);
minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
}
}
@ -1131,6 +1137,13 @@ final class SimpleTextBKDWriter implements Closeable {
if (cmp > 0) {
throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord + " sortedDim=" + sortedDim);
}
if (cmp == 0 && numDataDims > numIndexDims) {
int dataOffset = numIndexDims * bytesPerDim;
cmp = Arrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength);
if (cmp > 0) {
throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
}
}
if (cmp == 0 && doc < lastDoc) {
throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord + " sortedDim=" + sortedDim);
}

View File

@ -47,6 +47,8 @@ public final class BKDRadixSelector {
private final int bytesSorted;
//data dimensions size
private final int packedBytesLength;
// data dimensions plus docID size
private final int packedBytesDocIDLength;
//flag to when we are moving to sort on heap
private final int maxPointsSortInHeap;
//reusable buffer
@ -59,18 +61,26 @@ public final class BKDRadixSelector {
private final Directory tempDir;
// prefix for temp files
private final String tempFileNamePrefix;
// data and index dimensions
private final int numDataDims, numIndexDims;
/**
* Sole constructor.
*/
public BKDRadixSelector(int numDim, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) {
public BKDRadixSelector(int numDataDims, int numIndexDims, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) {
this.bytesPerDim = bytesPerDim;
this.packedBytesLength = numDim * bytesPerDim;
this.bytesSorted = bytesPerDim + Integer.BYTES;
this.numDataDims = numDataDims;
this.numIndexDims = numIndexDims;
this.packedBytesLength = numDataDims * bytesPerDim;
this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES;
// Selection and sorting is done in a given dimension. In case the value of the dimension are equal
// between two points we tie break first using the data-only dimensions and if those are still equal
// we tie-break on the docID. Here we account for all bytes used in the process.
this.bytesSorted = bytesPerDim + (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES;
this.maxPointsSortInHeap = maxPointsSortInHeap;
int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / (packedBytesLength + Integer.BYTES);
this.offlineBuffer = new byte[numberOfPointsOffline * (packedBytesLength + Integer.BYTES)];
int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / packedBytesDocIDLength;
this.offlineBuffer = new byte[numberOfPointsOffline * packedBytesDocIDLength];
this.partitionBucket = new int[bytesSorted];
this.histogram = new long[HISTOGRAM_SIZE];
this.scratch = new byte[bytesSorted];
@ -133,12 +143,12 @@ public final class BKDRadixSelector {
assert commonPrefixPosition > dimCommonPrefix;
reader.next();
PointValue pointValue = reader.pointValue();
BytesRef packedValueDocID = pointValue.packedValueDocIDBytes();
// copy dimension
BytesRef packedValue = pointValue.packedValue();
System.arraycopy(packedValue.bytes, packedValue.offset + offset, scratch, 0, bytesPerDim);
// copy docID
BytesRef docIDBytes = pointValue.docIDBytes();
System.arraycopy(docIDBytes.bytes, docIDBytes.offset, scratch, bytesPerDim, Integer.BYTES);
System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + offset, scratch, 0, bytesPerDim);
// copy data dimensions and docID
System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + numIndexDims * bytesPerDim, scratch, bytesPerDim, (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES);
for (long i = from + 1; i < to; i++) {
reader.next();
pointValue = reader.pointValue();
@ -156,13 +166,15 @@ public final class BKDRadixSelector {
//check common prefix and adjust histogram
final int startIndex = (dimCommonPrefix > bytesPerDim) ? bytesPerDim : dimCommonPrefix;
final int endIndex = (commonPrefixPosition > bytesPerDim) ? bytesPerDim : commonPrefixPosition;
packedValue = pointValue.packedValue();
int j = Arrays.mismatch(scratch, startIndex, endIndex, packedValue.bytes, packedValue.offset + offset + startIndex, packedValue.offset + offset + endIndex);
packedValueDocID = pointValue.packedValueDocIDBytes();
int j = Arrays.mismatch(scratch, startIndex, endIndex, packedValueDocID.bytes, packedValueDocID.offset + offset + startIndex, packedValueDocID.offset + offset + endIndex);
if (j == -1) {
if (commonPrefixPosition > bytesPerDim) {
//tie-break on docID
docIDBytes = pointValue.docIDBytes();
int k = Arrays.mismatch(scratch, bytesPerDim, commonPrefixPosition, docIDBytes.bytes, docIDBytes.offset, docIDBytes.offset + commonPrefixPosition - bytesPerDim);
//tie-break on data dimensions + docID
final int startTieBreak = numIndexDims * bytesPerDim;
final int endTieBreak = startTieBreak + commonPrefixPosition - bytesPerDim;
int k = Arrays.mismatch(scratch, bytesPerDim, commonPrefixPosition,
packedValueDocID.bytes, packedValueDocID.offset + startTieBreak, packedValueDocID.offset + endTieBreak);
if (k != -1) {
commonPrefixPosition = bytesPerDim + k;
Arrays.fill(histogram, 0);
@ -194,8 +206,8 @@ public final class BKDRadixSelector {
BytesRef packedValue = pointValue.packedValue();
bucket = packedValue.bytes[packedValue.offset + offset + commonPrefixPosition] & 0xff;
} else {
BytesRef docIDValue = pointValue.docIDBytes();
bucket = docIDValue.bytes[docIDValue.offset + commonPrefixPosition - bytesPerDim] & 0xff;
BytesRef packedValueDocID = pointValue.packedValueDocIDBytes();
bucket = packedValueDocID.bytes[packedValueDocID.offset + numIndexDims * bytesPerDim + commonPrefixPosition - bytesPerDim] & 0xff;
}
return bucket;
}
@ -309,10 +321,11 @@ public final class BKDRadixSelector {
return partition;
}
private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefix) {
final int offset = dim * bytesPerDim + commonPrefix;
final int dimCmpBytes = bytesPerDim - commonPrefix;
new RadixSelector(bytesSorted - commonPrefix) {
private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefixLength) {
final int dimOffset = dim * bytesPerDim + commonPrefixLength;
final int dimCmpBytes = bytesPerDim - commonPrefixLength;
final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes;
new RadixSelector(bytesSorted - commonPrefixLength) {
@Override
protected void swap(int i, int j) {
@ -324,23 +337,23 @@ public final class BKDRadixSelector {
assert k >= 0 : "negative prefix " + k;
if (k < dimCmpBytes) {
// dim bytes
return points.block[i * packedBytesLength + offset + k] & 0xff;
return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff;
} else {
// doc id
int s = 3 - (k - dimCmpBytes);
return (points.docIDs[i] >>> (s * 8)) & 0xff;
// data bytes
return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff;
}
}
@Override
protected Selector getFallbackSelector(int d) {
int skypedBytes = d + commonPrefix;
final int start = dim * bytesPerDim + skypedBytes;
final int end = dim * bytesPerDim + bytesPerDim;
final int skypedBytes = d + commonPrefixLength;
final int dimStart = dim * bytesPerDim + skypedBytes;
final int dimEnd = dim * bytesPerDim + bytesPerDim;
final int dataOffset = numIndexDims * bytesPerDim;
// data length is composed by the data dimensions plus the docID
final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES;
return new IntroSelector() {
int pivotDoc = -1;
@Override
protected void swap(int i, int j) {
points.swap(i, j);
@ -349,36 +362,37 @@ public final class BKDRadixSelector {
@Override
protected void setPivot(int i) {
if (skypedBytes < bytesPerDim) {
System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim);
System.arraycopy(points.block, i * packedBytesDocIDLength + dim * bytesPerDim, scratch, 0, bytesPerDim);
}
pivotDoc = points.docIDs[i];
System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength);
}
@Override
protected int compare(int i, int j) {
if (skypedBytes < bytesPerDim) {
int iOffset = i * packedBytesLength;
int jOffset = j * packedBytesLength;
int cmp = Arrays.compareUnsigned(points.block, iOffset + start, iOffset + end,
points.block, jOffset + start, jOffset + end);
int iOffset = i * packedBytesDocIDLength;
int jOffset = j * packedBytesDocIDLength;
int cmp = Arrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd);
if (cmp != 0) {
return cmp;
}
}
return points.docIDs[i] - points.docIDs[j];
int iOffset = i * packedBytesDocIDLength + dataOffset;
int jOffset = j * packedBytesDocIDLength + dataOffset;
return Arrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength);
}
@Override
protected int comparePivot(int j) {
if (skypedBytes < bytesPerDim) {
int jOffset = j * packedBytesLength;
int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim,
points.block, jOffset + start, jOffset + end);
int jOffset = j * packedBytesDocIDLength;
int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd);
if (cmp != 0) {
return cmp;
}
}
return pivotDoc - points.docIDs[j];
int jOffset = j * packedBytesDocIDLength + dataOffset;
return Arrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength);
}
};
}
@ -393,8 +407,9 @@ public final class BKDRadixSelector {
/** Sort the heap writer by the specified dim. It is used to sort the leaves of the tree */
public void heapRadixSort(final HeapPointWriter points, int from, int to, int dim, int commonPrefixLength) {
final int offset = dim * bytesPerDim + commonPrefixLength;
final int dimOffset = dim * bytesPerDim + commonPrefixLength;
final int dimCmpBytes = bytesPerDim - commonPrefixLength;
final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes;
new MSBRadixSorter(bytesSorted - commonPrefixLength) {
@Override
@ -402,11 +417,10 @@ public final class BKDRadixSelector {
assert k >= 0 : "negative prefix " + k;
if (k < dimCmpBytes) {
// dim bytes
return points.block[i * packedBytesLength + offset + k] & 0xff;
return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff;
} else {
// doc id
int s = 3 - (k - dimCmpBytes);
return (points.docIDs[i] >>> (s * 8)) & 0xff;
// data bytes
return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff;
}
}
@ -417,13 +431,14 @@ public final class BKDRadixSelector {
@Override
protected Sorter getFallbackSorter(int k) {
int skypedBytes = k + commonPrefixLength;
final int start = dim * bytesPerDim + skypedBytes;
final int end = dim * bytesPerDim + bytesPerDim;
final int skypedBytes = k + commonPrefixLength;
final int dimStart = dim * bytesPerDim + skypedBytes;
final int dimEnd = dim * bytesPerDim + bytesPerDim;
final int dataOffset = numIndexDims * bytesPerDim;
// data length is composed by the data dimensions plus the docID
final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES;
return new IntroSorter() {
int pivotDoc = -1;
@Override
protected void swap(int i, int j) {
points.swap(i, j);
@ -432,36 +447,37 @@ public final class BKDRadixSelector {
@Override
protected void setPivot(int i) {
if (skypedBytes < bytesPerDim) {
System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim);
System.arraycopy(points.block, i * packedBytesDocIDLength + dim * bytesPerDim, scratch, 0, bytesPerDim);
}
pivotDoc = points.docIDs[i];
System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength);
}
@Override
protected int compare(int i, int j) {
if (skypedBytes < bytesPerDim) {
int iOffset = i * packedBytesLength;
int jOffset = j * packedBytesLength;
int cmp = Arrays.compareUnsigned(points.block, iOffset + start, iOffset + end,
points.block, jOffset + start, jOffset + end);
int iOffset = i * packedBytesDocIDLength;
int jOffset = j * packedBytesDocIDLength;
int cmp = Arrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd);
if (cmp != 0) {
return cmp;
}
}
return points.docIDs[i] - points.docIDs[j];
int iOffset = i * packedBytesDocIDLength + dataOffset;
int jOffset = j * packedBytesDocIDLength + dataOffset;
return Arrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength);
}
@Override
protected int comparePivot(int j) {
if (skypedBytes < bytesPerDim) {
int jOffset = j * packedBytesLength;
int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim,
points.block, jOffset + start, jOffset + end);
int jOffset = j * packedBytesDocIDLength;
int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd);
if (cmp != 0) {
return cmp;
}
}
return pivotDoc - points.docIDs[j];
int jOffset = j * packedBytesDocIDLength + dataOffset;
return Arrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength);
}
};
}

View File

@ -774,7 +774,7 @@ public class BKDWriter implements Closeable {
assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
//We re-use the selector so we do not need to create an object every time.
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
boolean success = false;
try {
@ -785,7 +785,8 @@ public class BKDWriter implements Closeable {
minPackedValue, maxPackedValue,
parentSplits,
splitPackedValues,
leafBlockFPs);
leafBlockFPs,
new int[maxPointsInLeafNode]);
assert Arrays.equals(parentSplits, new int[numIndexDims]);
// If no exception, we should have cleaned everything up:
@ -1361,7 +1362,7 @@ public class BKDWriter implements Closeable {
}
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths,
reader, from, to, scratchBytesRef1, scratchBytesRef2);
BytesRef comparator = scratchBytesRef1;
@ -1428,7 +1429,7 @@ public class BKDWriter implements Closeable {
commonPrefixLen = bytesPerDim;
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
// set the split value
@ -1465,7 +1466,8 @@ public class BKDWriter implements Closeable {
byte[] minPackedValue, byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs) throws IOException {
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
@ -1525,7 +1527,13 @@ public class BKDWriter implements Closeable {
// loading the values:
int count = to - from;
assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
writeLeafBlockDocs(out, heapSource.docIDs, from, count);
assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length;
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = 0; i < count; i++) {
docIDs[i] = heapSource.getPackedValueSlice(from + i).docID();
}
writeLeafBlockDocs(out, docIDs, 0, count);
// TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
// from the index, much like how terms dict does so from the FST:
@ -1548,7 +1556,7 @@ public class BKDWriter implements Closeable {
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
heapSource.docIDs, from);
docIDs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
} else {
@ -1595,12 +1603,12 @@ public class BKDWriter implements Closeable {
// Recurse on left tree:
build(2 * nodeID, leafNodeOffset, slices[0],
out, radixSelector, minPackedValue, maxSplitPackedValue,
parentSplits, splitPackedValues, leafBlockFPs);
parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
// Recurse on right tree:
build(2 * nodeID + 1, leafNodeOffset, slices[1],
out, radixSelector, minSplitPackedValue, maxPackedValue
, parentSplits, splitPackedValues, leafBlockFPs);
, parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
parentSplits[splitDim]--;
}
@ -1654,6 +1662,13 @@ public class BKDWriter implements Closeable {
if (cmp > 0) {
throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
}
if (cmp == 0 && numDataDims > numIndexDims) {
int dataOffset = numIndexDims * bytesPerDim;
cmp = Arrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength);
if (cmp > 0) {
throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
}
}
if (cmp == 0 && doc < lastDoc) {
throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord);
}

View File

@ -27,16 +27,16 @@ public final class HeapPointReader implements PointReader {
private int curRead;
final byte[] block;
final int packedBytesLength;
final int[] docIDs;
final int packedBytesDocIDLength;
final int end;
private final HeapPointValue pointValue;
public HeapPointReader(byte[] block, int packedBytesLength, int[] docIDs, int start, int end) {
public HeapPointReader(byte[] block, int packedBytesLength, int start, int end) {
this.block = block;
this.docIDs = docIDs;
curRead = start-1;
this.end = end;
this.packedBytesLength = packedBytesLength;
this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES;
if (start < end) {
this.pointValue = new HeapPointValue(block, packedBytesLength);
} else {
@ -53,7 +53,7 @@ public final class HeapPointReader implements PointReader {
@Override
public PointValue pointValue() {
pointValue.setValue(curRead * packedBytesLength, docIDs[curRead]);
pointValue.setOffset(curRead * packedBytesDocIDLength);
return pointValue;
}
@ -66,21 +66,22 @@ public final class HeapPointReader implements PointReader {
*/
static class HeapPointValue implements PointValue {
BytesRef packedValue;
BytesRef docIDBytes;
int docID;
final BytesRef packedValue;
final BytesRef packedValueDocID;
final int packedValueLength;
public HeapPointValue(byte[] value, int packedLength) {
packedValue = new BytesRef(value, 0, packedLength);
docIDBytes = new BytesRef(new byte[4]);
HeapPointValue(byte[] value, int packedValueLength) {
this.packedValueLength = packedValueLength;
this.packedValue = new BytesRef(value, 0, packedValueLength);
this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES);
}
/**
* Sets a new value by changing the offset and docID.
* Sets a new value by changing the offset.
*/
public void setValue(int offset, int docID) {
this.docID = docID;
public void setOffset(int offset) {
packedValue.offset = offset;
packedValueDocID.offset = offset;
}
@Override
@ -90,16 +91,14 @@ public final class HeapPointReader implements PointReader {
@Override
public int docID() {
return docID;
int position = packedValueDocID.offset + packedValueLength;
return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16)
| ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF);
}
@Override
public BytesRef docIDBytes() {
docIDBytes.bytes[0] = (byte) (docID >> 24);
docIDBytes.bytes[1] = (byte) (docID >> 16);
docIDBytes.bytes[2] = (byte) (docID >> 8);
docIDBytes.bytes[3] = (byte) (docID >> 0);
return docIDBytes;
public BytesRef packedValueDocIDBytes() {
return packedValueDocID;
}
}
}

View File

@ -26,10 +26,10 @@ import org.apache.lucene.util.BytesRef;
* @lucene.internal
* */
public final class HeapPointWriter implements PointWriter {
public final int[] docIDs;
public final byte[] block;
final int size;
final int packedBytesLength;
final int packedBytesDocIDLength;
private final byte[] scratch;
private int nextWrite;
private boolean closed;
@ -38,11 +38,11 @@ public final class HeapPointWriter implements PointWriter {
public HeapPointWriter(int size, int packedBytesLength) {
this.docIDs = new int[size];
this.block = new byte[packedBytesLength * size];
this.size = size;
this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES;
this.packedBytesLength = packedBytesLength;
this.scratch = new byte[packedBytesLength];
this.block = new byte[packedBytesDocIDLength * size];
this.size = size;
this.scratch = new byte[packedBytesDocIDLength];
if (size > 0) {
pointValue = new HeapPointReader.HeapPointValue(block, packedBytesLength);
} else {
@ -54,7 +54,7 @@ public final class HeapPointWriter implements PointWriter {
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
public PointValue getPackedValueSlice(int index) {
assert index < nextWrite : "nextWrite=" + (nextWrite) + " vs index=" + index;
pointValue.setValue(index * packedBytesLength, docIDs[index]);
pointValue.setOffset(index * packedBytesDocIDLength);
return pointValue;
}
@ -63,8 +63,12 @@ public final class HeapPointWriter implements PointWriter {
assert closed == false : "point writer is already closed";
assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]";
assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size;
System.arraycopy(packedValue, 0, block, nextWrite * packedBytesLength, packedBytesLength);
docIDs[nextWrite] = docID;
System.arraycopy(packedValue, 0, block, nextWrite * packedBytesDocIDLength, packedBytesLength);
int position = nextWrite * packedBytesDocIDLength + packedBytesLength;
block[position] = (byte) (docID >> 24);
block[++position] = (byte) (docID >> 16);
block[++position] = (byte) (docID >> 8);
block[++position] = (byte) (docID >> 0);
nextWrite++;
}
@ -72,27 +76,23 @@ public final class HeapPointWriter implements PointWriter {
public void append(PointValue pointValue) {
assert closed == false : "point writer is already closed";
assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size;
BytesRef packedValue = pointValue.packedValue();
assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + (packedBytesLength) + "] but was [" + packedValue.length + "]";
System.arraycopy(packedValue.bytes, packedValue.offset, block, nextWrite * packedBytesLength, packedBytesLength);
docIDs[nextWrite] = pointValue.docID();
BytesRef packedValueDocID = pointValue.packedValueDocIDBytes();
assert packedValueDocID.length == packedBytesDocIDLength : "[packedValue] must have length [" + (packedBytesDocIDLength) + "] but was [" + packedValueDocID.length + "]";
System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset, block, nextWrite * packedBytesDocIDLength, packedBytesDocIDLength);
nextWrite++;
}
public void swap(int i, int j) {
int docID = docIDs[i];
docIDs[i] = docIDs[j];
docIDs[j] = docID;
int indexI = i * packedBytesLength;
int indexJ = j * packedBytesLength;
int indexI = i * packedBytesDocIDLength;
int indexJ = j * packedBytesDocIDLength;
// scratch1 = values[i]
System.arraycopy(block, indexI, scratch, 0, packedBytesLength);
System.arraycopy(block, indexI, scratch, 0, packedBytesDocIDLength);
// values[i] = values[j]
System.arraycopy(block, indexJ, block, indexI, packedBytesLength);
System.arraycopy(block, indexJ, block, indexI, packedBytesDocIDLength);
// values[j] = scratch1
System.arraycopy(scratch, 0, block, indexJ, packedBytesLength);
System.arraycopy(scratch, 0, block, indexJ, packedBytesDocIDLength);
}
public int computeCardinality(int from, int to, int numDataDims, int bytesPerDim, int[] commonPrefixLengths) {
@ -102,8 +102,8 @@ public final class HeapPointWriter implements PointWriter {
for (int dim = 0; dim < numDataDims; dim++) {
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
final int end = dim * bytesPerDim + bytesPerDim;
if (Arrays.mismatch(block, i * packedBytesLength + start, i * packedBytesLength + end,
block, (i - 1) * packedBytesLength + start, (i - 1) * packedBytesLength + end) != -1) {
if (Arrays.mismatch(block, i * packedBytesDocIDLength + start, i * packedBytesDocIDLength + end,
block, (i - 1) * packedBytesDocIDLength + start, (i - 1) * packedBytesDocIDLength + end) != -1) {
leafCardinality++;
break;
}
@ -120,9 +120,9 @@ public final class HeapPointWriter implements PointWriter {
@Override
public PointReader getReader(long start, long length) {
assert closed : "point writer is still open and trying to get a reader";
assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length;
assert start + length <= size: "start=" + start + " length=" + length + " docIDs.length=" + size;
assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite;
return new HeapPointReader(block, packedBytesLength, docIDs, (int) start, Math.toIntExact(start+length));
return new HeapPointReader(block, packedBytesLength, (int) start, Math.toIntExact(start+length));
}
@Override
@ -136,6 +136,6 @@ public final class HeapPointWriter implements PointWriter {
@Override
public String toString() {
return "HeapPointWriter(count=" + nextWrite + " size=" + docIDs.length + ")";
return "HeapPointWriter(count=" + nextWrite + " size=" + size + ")";
}
}

View File

@ -78,7 +78,8 @@ public final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < packedBytesLength) {
reader.getValue(j, scratch);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k, scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k,
scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k);
if (cmp != 0) {
return cmp;
}
@ -92,14 +93,16 @@ public final class MutablePointsReaderUtils {
}
/** Sort points on the given dimension. */
public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
public static void sortByDim(int numDataDim, int numIndexDim, int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointValues reader, int from, int to,
BytesRef scratch1, BytesRef scratch2) {
final int start = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
final int dimEnd = sortedDim * bytesPerDim + bytesPerDim;
final int dataStart = numIndexDim * bytesPerDim;
final int dataEnd = dataStart + (numDataDim - numIndexDim) * bytesPerDim;
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
final int offset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim];
new IntroSorter() {
final BytesRef pivot = scratch1;
@ -119,9 +122,14 @@ public final class MutablePointsReaderUtils {
@Override
protected int comparePivot(int j) {
reader.getValue(j, scratch2);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + offset, pivot.offset + offset + numBytesToCompare, scratch2.bytes, scratch2.offset + offset, scratch2.offset + offset + numBytesToCompare);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + start, pivot.offset + dimEnd, scratch2.bytes,
scratch2.offset + start, scratch2.offset + dimEnd);
if (cmp == 0) {
cmp = pivotDoc - reader.getDocID(j);
cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd,
scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd);
if (cmp == 0) {
cmp = pivotDoc - reader.getDocID(j);
}
}
return cmp;
}
@ -131,16 +139,20 @@ public final class MutablePointsReaderUtils {
/** Partition points around {@code mid}. All values on the left must be less
* than or equal to it and all values on the right must be greater than or
* equal to it. */
public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
public static void partition(int numDataDim, int numIndexDim, int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointValues reader, int from, int to, int mid,
BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int dimOffset = splitDim * bytesPerDim + commonPrefixLen;
final int dimCmpBytes = bytesPerDim - commonPrefixLen;
final int dataOffset = numIndexDim * bytesPerDim;
final int dataCmpBytes = (numDataDim - numIndexDim) * bytesPerDim + dimCmpBytes;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
new RadixSelector(cmpBytes + (bitsPerDocId + 7) / 8) {
new RadixSelector(dataCmpBytes + (bitsPerDocId + 7) / 8) {
@Override
protected Selector getFallbackSelector(int k) {
final int dataStart = (k < dimCmpBytes) ? dataOffset : dataOffset + k - dimCmpBytes;
final int dataEnd = numDataDim * bytesPerDim;
return new IntroSelector() {
final BytesRef pivot = scratch1;
@ -159,9 +171,18 @@ public final class MutablePointsReaderUtils {
@Override
protected int comparePivot(int j) {
if (k < cmpBytes) {
if (k < dimCmpBytes) {
reader.getValue(j, scratch2);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + offset + k, pivot.offset + offset + k + cmpBytes - k, scratch2.bytes, scratch2.offset + offset + k, scratch2.offset + offset + k + cmpBytes - k);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dimOffset + k, pivot.offset + dimOffset + dimCmpBytes,
scratch2.bytes, scratch2.offset + dimOffset + k, scratch2.offset + dimOffset + dimCmpBytes);
if (cmp != 0) {
return cmp;
}
}
if (k < dataCmpBytes) {
reader.getValue(j, scratch2);
int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd,
scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd);
if (cmp != 0) {
return cmp;
}
@ -178,10 +199,12 @@ public final class MutablePointsReaderUtils {
@Override
protected int byteAt(int i, int k) {
if (k < cmpBytes) {
return Byte.toUnsignedInt(reader.getByteAt(i, offset + k));
if (k < dimCmpBytes) {
return Byte.toUnsignedInt(reader.getByteAt(i, dimOffset + k));
} else if (k < dataCmpBytes) {
return Byte.toUnsignedInt(reader.getByteAt(i, dataOffset + k - dimCmpBytes));
} else {
final int shift = bitsPerDocId - ((k - cmpBytes + 1) << 3);
final int shift = bitsPerDocId - ((k - dataCmpBytes + 1) << 3);
return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff;
}
}

View File

@ -137,12 +137,14 @@ public final class OfflinePointReader implements PointReader {
*/
static class OfflinePointValue implements PointValue {
BytesRef packedValue;
BytesRef docIDBytes;
final BytesRef packedValue;
final BytesRef packedValueDocID;
final int packedValueLength;
OfflinePointValue(byte[] value, int packedValueLength) {
packedValue = new BytesRef(value, 0, packedValueLength);
docIDBytes = new BytesRef(value, packedValueLength, Integer.BYTES);
this.packedValueLength = packedValueLength;
this.packedValue = new BytesRef(value, 0, packedValueLength);
this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES);
}
/**
@ -150,7 +152,7 @@ public final class OfflinePointReader implements PointReader {
*/
public void setOffset(int offset) {
packedValue.offset = offset;
docIDBytes.offset = offset + packedValue.length;
packedValueDocID.offset = offset;
}
@Override
@ -160,14 +162,14 @@ public final class OfflinePointReader implements PointReader {
@Override
public int docID() {
int position =docIDBytes.offset;
return ((docIDBytes.bytes[position] & 0xFF) << 24) | ((docIDBytes.bytes[++position] & 0xFF) << 16)
| ((docIDBytes.bytes[++position] & 0xFF) << 8) | (docIDBytes.bytes[++position] & 0xFF);
int position = packedValueDocID.offset + packedValueLength;
return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16)
| ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF);
}
@Override
public BytesRef docIDBytes() {
return docIDBytes;
public BytesRef packedValueDocIDBytes() {
return packedValueDocID;
}
}

View File

@ -62,12 +62,9 @@ public final class OfflinePointWriter implements PointWriter {
@Override
public void append(PointValue pointValue) throws IOException {
assert closed == false : "Point writer is already closed";
BytesRef packedValue = pointValue.packedValue();
assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]";
out.writeBytes(packedValue.bytes, packedValue.offset, packedValue.length);
BytesRef docIDBytes = pointValue.docIDBytes();
assert docIDBytes.length == Integer.BYTES : "[docIDBytes] must have length [" + Integer.BYTES + "] but was [" + docIDBytes.length + "]";
out.writeBytes(docIDBytes.bytes, docIDBytes.offset, docIDBytes.length);
BytesRef packedValueDocID = pointValue.packedValueDocIDBytes();
assert packedValueDocID.length == packedBytesLength + Integer.BYTES : "[packedValue and docID] must have length [" + (packedBytesLength + Integer.BYTES) + "] but was [" + packedValueDocID.length + "]";
out.writeBytes(packedValueDocID.bytes, packedValueDocID.offset, packedValueDocID.length);
count++;
assert expectedCount == 0 || count <= expectedCount : "expectedCount=" + expectedCount + " vs count=" + count;
}

View File

@ -28,10 +28,11 @@ public interface PointValue {
/** Returns the packed values for the dimensions */
BytesRef packedValue();
/** Returns the document id */
/** Returns the docID */
int docID();
/** Returns the byte representation of the document id */
BytesRef docIDBytes();
/** Returns the byte representation of the packed value
* together with the docID */
BytesRef packedValueDocIDBytes();
}

View File

@ -1097,6 +1097,80 @@ public class TestBKD extends LuceneTestCase {
}
}
/**
 * Verifies that points sharing identical index dimensions but different data dimensions
 * are laid out in optimal order inside the BKD leaves: the writer only receives two
 * distinct packed values, so an in-order traversal must see at most one transition
 * between them (all copies of one value, then all copies of the other).
 */
public void testCheckDataDimOptimalOrder() throws IOException {
Directory dir = newDirectory();
final int numValues = atLeast(5000);
final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4);
final double maxMB = (float) 3.0 + (3*random().nextDouble());
// numIndexDims <= numDataDims: the extra trailing dimensions are data-only.
final int numIndexDims = TestUtil.nextInt(random(), 1, 8);
final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8);
final byte[] pointValue1 = new byte[numDataDims * numBytesPerDim];
final byte[] pointValue2 = new byte[numDataDims * numBytesPerDim];
random().nextBytes(pointValue1);
random().nextBytes(pointValue2);
// equal index dimensions but different data dimensions
for (int i = 0; i < numIndexDims; i++) {
System.arraycopy(pointValue1, i * numBytesPerDim, pointValue2, i * numBytesPerDim, numBytesPerDim);
}
BKDWriter w = new BKDWriter(2 * numValues, dir, "_temp", numDataDims, numIndexDims, numBytesPerDim, maxPointsInLeafNode,
maxMB, 2 * numValues);
// interleave the two values so any ordering the reader reports is the writer's doing
for (int i = 0; i < numValues; ++i) {
w.add(pointValue1, i);
w.add(pointValue2, i);
}
final long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
w.close();
}
IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT);
pointsIn.seek(indexFP);
BKDReader points = new BKDReader(pointsIn);
points.intersect(new IntersectVisitor() {
// last distinct packed value seen during traversal
byte[] previous = null;
// whether we already observed the single allowed value transition
boolean hasChanged = false;
@Override
public void visit(int docID) {
// docID-only visits would bypass the value-order check, so they are forbidden here
throw new UnsupportedOperationException();
}
@Override
public void visit(int docID, byte[] packedValue) {
if (previous == null) {
previous = new byte[numDataDims * numBytesPerDim];
System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim);
} else {
// Arrays.mismatch returns -1 when the arrays are equal
int mismatch = Arrays.mismatch(packedValue, previous);
if (mismatch != -1) {
if (hasChanged == false) {
// first (and only permitted) switch between the two distinct values
hasChanged = true;
System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim);
} else {
fail("Points are not in optimal order");
}
}
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
// force the visitor to see every value
return Relation.CELL_CROSSES_QUERY;
}
});
pointsIn.close();
dir.close();
}
public void test2DLongOrdsOffline() throws Exception {
try (Directory dir = newDirectory()) {
int numDocs = 100000;

View File

@ -208,7 +208,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
//we only split by indexed dimension so we check for each only those dimension
for (int splitDim = 0; splitDim < indexDimensions; splitDim++) {
//We need to make a copy of the data as it is deleted in the process
@ -224,9 +224,15 @@ public class TestBKDRadixSelector extends LuceneTestCase {
int cmp = Arrays.compareUnsigned(max, 0, bytesPerDimensions, min, 0, bytesPerDimensions);
assertTrue(cmp <= 0);
if (cmp == 0) {
int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint);
int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint);
assertTrue(minDocId >= maxDocID);
byte[] maxDataDim = getMaxDataDimension(slices[0], bytesPerDimensions, dataDimensions, indexDimensions, max, splitDim);
byte[] minDataDim = getMinDataDimension(slices[1], bytesPerDimensions, dataDimensions, indexDimensions, min, splitDim);
cmp = Arrays.compareUnsigned(maxDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions, minDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions);
assertTrue(cmp <= 0);
if (cmp == 0) {
int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,maxDataDim);
int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,minDataDim);
assertTrue(minDocId >= maxDocID);
}
}
assertTrue(Arrays.equals(partitionPoint, min));
slices[0].writer.destroy();
@ -292,14 +298,17 @@ public class TestBKDRadixSelector extends LuceneTestCase {
return min;
}
private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException {
private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException {
int docID = Integer.MAX_VALUE;
try (PointReader reader = p.writer.getReader(p.start, p.count)) {
while (reader.next()) {
PointValue pointValue = reader.pointValue();
BytesRef packedValue = pointValue.packedValue();
int offset = dimension * bytesPerDimension;
if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) {
int dataOffset = indexDims * bytesPerDimension;
int dataLength = (dataDims - indexDims) * bytesPerDimension;
if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0
&& Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) {
int newDocID = pointValue.docID();
if (newDocID < docID) {
docID = newDocID;
@ -310,6 +319,26 @@ public class TestBKDRadixSelector extends LuceneTestCase {
return docID;
}
/**
 * Returns the smallest (unsigned-lexicographic) data-dimensions suffix among all points
 * in {@code p} whose {@code splitDim} value equals {@code minDim}. The returned array
 * covers only the data-only dimensions, i.e. {@code (dataDims - indexDims)} dimensions.
 */
private byte[] getMinDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] minDim, int splitDim) throws IOException {
byte[] min = new byte[(dataDims - indexDims) * bytesPerDimension];
// start from the maximum possible value so any matching point lowers it
Arrays.fill(min, (byte) 0xff);
int offset = splitDim * bytesPerDimension;
try (PointReader reader = p.writer.getReader(p.start, p.count)) {
byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension];
while (reader.next()) {
PointValue pointValue = reader.pointValue();
BytesRef packedValue = pointValue.packedValue();
// only consider points whose split dimension equals minDim (mismatch == -1 means equal)
if (Arrays.mismatch(minDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) {
// extract the data-only suffix, which starts right after the index dimensions
System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension);
if (Arrays.compareUnsigned(min, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) > 0) {
System.arraycopy(value, 0, min, 0, (dataDims - indexDims) * bytesPerDimension);
}
}
}
}
return min;
}
private byte[] getMax(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension) throws IOException {
byte[] max = new byte[bytesPerDimension];
Arrays.fill(max, (byte) 0);
@ -327,14 +356,37 @@ public class TestBKDRadixSelector extends LuceneTestCase {
return max;
}
private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException {
/**
 * Returns the largest (unsigned-lexicographic) data-dimensions suffix among all points
 * in {@code p} whose {@code splitDim} value equals {@code maxDim}. The returned array
 * covers only the data-only dimensions, i.e. {@code (dataDims - indexDims)} dimensions.
 */
private byte[] getMaxDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] maxDim, int splitDim) throws IOException {
byte[] max = new byte[(dataDims - indexDims) * bytesPerDimension];
// start from the minimum possible value so any matching point raises it
Arrays.fill(max, (byte) 0);
int offset = splitDim * bytesPerDimension;
try (PointReader reader = p.writer.getReader(p.start, p.count)) {
byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension];
while (reader.next()) {
PointValue pointValue = reader.pointValue();
BytesRef packedValue = pointValue.packedValue();
// only consider points whose split dimension equals maxDim (mismatch == -1 means equal)
if (Arrays.mismatch(maxDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) {
// extract the data-only suffix, which starts right after the index dimensions
System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension);
if (Arrays.compareUnsigned(max, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) < 0) {
System.arraycopy(value, 0, max, 0, (dataDims - indexDims) * bytesPerDimension);
}
}
}
}
return max;
}
private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException {
int docID = Integer.MIN_VALUE;
try (PointReader reader = p.writer.getReader(p.start, p.count)) {
while (reader.next()) {
PointValue pointValue = reader.pointValue();
BytesRef packedValue = pointValue.packedValue();
int offset = dimension * bytesPerDimension;
if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) {
int dataOffset = indexDims * bytesPerDimension;
int dataLength = (dataDims - indexDims) * bytesPerDimension;
if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0
&& Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) {
int newDocID = pointValue.docID();
if (newDocID > docID) {
docID = newDocID;

View File

@ -116,7 +116,7 @@ public class TestBKDRadixSort extends LuceneTestCase {
private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{
int packedBytesLength = dataDimensions * bytesPerDim;
Directory dir = newDirectory();
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test");
BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDim, 1000, dir, "test");
// we check for each dimension
for (int splitDim = 0; splitDim < dataDimensions; splitDim++) {
radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim));
@ -129,6 +129,11 @@ public class TestBKDRadixSort extends LuceneTestCase {
BytesRef value = pointValue.packedValue();
int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim);
assertTrue(cmp >= 0);
if (cmp == 0) {
int dataOffset = indexDimensions * bytesPerDim;
cmp = Arrays.compareUnsigned(value.bytes, value.offset + dataOffset, value.offset + packedBytesLength, previous, dataOffset, packedBytesLength);
assertTrue(cmp >= 0);
}
if (cmp == 0) {
assertTrue(pointValue.docID() >= previousDocId);
}

View File

@ -37,7 +37,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
private void doTestSort() {
final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
Point[] points = createRandomPoints(1, bytesPerDim, maxDoc);
Point[] points = createRandomPoints(1, 1, bytesPerDim, maxDoc, new int[1]);
DummyPointsReader reader = new DummyPointsReader(points);
MutablePointsReaderUtils.sort(maxDoc, bytesPerDim, reader, 0, points.length);
Arrays.sort(points, new Comparator<Point>() {
@ -61,25 +61,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private void doTestSortByDim() {
final int numDims = TestUtil.nextInt(random(), 1, 8);
final int numIndexDims = TestUtil.nextInt(random(), 1, 8);
final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8);
final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc);
int[] commonPrefixLengths = new int[numDims];
for (int i = 0; i < commonPrefixLengths.length; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = 0; dim < numDims; ++dim) {
int offset = dim * bytesPerDim;
BytesRef packedValue = points[i].packedValue;
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
int[] commonPrefixLengths = new int[numDataDims];
Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths);
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(numDims);
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length,
final int sortedDim = random().nextInt(numIndexDims);
MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length,
new BytesRef(), new BytesRef());
for (int i = 1; i < points.length; ++i) {
final int offset = sortedDim * bytesPerDim;
@ -87,7 +77,13 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
BytesRef currentValue = reader.points[i].packedValue;
int cmp = Arrays.compareUnsigned(previousValue.bytes, previousValue.offset + offset, previousValue.offset + offset + bytesPerDim, currentValue.bytes, currentValue.offset + offset, currentValue.offset + offset + bytesPerDim);
if (cmp == 0) {
cmp = reader.points[i - 1].doc - reader.points[i].doc;
int dataDimOffset = numIndexDims * bytesPerDim;
int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim;
cmp = Arrays.compareUnsigned(previousValue.bytes, previousValue.offset + dataDimOffset, previousValue.offset + dataDimOffset + dataDimsLength,
currentValue.bytes, currentValue.offset + dataDimOffset, currentValue.offset + dataDimOffset + dataDimsLength);
if (cmp == 0) {
cmp = reader.points[i - 1].doc - reader.points[i].doc;
}
}
assertTrue(cmp <= 0);
}
@ -100,29 +96,31 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private void doTestPartition() {
final int numDims = TestUtil.nextInt(random(), 1, 8);
final int numIndexDims = TestUtil.nextInt(random(), 1, 8);
final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8);
final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
int[] commonPrefixLengths = new int[numDataDims];
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc);
int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim);
final int splitDim = random().nextInt(numDims);
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
BytesRef packedValue = points[i].packedValue;
int offset = splitDim * bytesPerDim;
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength);
}
Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths);
final int splitDim = random().nextInt(numIndexDims);
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot,
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLengths[splitDim], reader, 0, points.length, pivot,
new BytesRef(), new BytesRef());
BytesRef pivotValue = reader.points[pivot].packedValue;
int offset = splitDim * bytesPerDim;
for (int i = 0; i < points.length; ++i) {
BytesRef value = reader.points[i].packedValue;
int cmp = Arrays.compareUnsigned(value.bytes, value.offset + offset, value.offset + offset + bytesPerDim, pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim);
int cmp = Arrays.compareUnsigned(value.bytes, value.offset + offset, value.offset + offset + bytesPerDim,
pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim);
if (cmp == 0) {
cmp = reader.points[i].doc - reader.points[pivot].doc;
int dataDimOffset = numIndexDims * bytesPerDim;
int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim;
cmp = Arrays.compareUnsigned(value.bytes, value.offset + dataDimOffset, value.offset + dataDimOffset + dataDimsLength,
pivotValue.bytes, pivotValue.offset + dataDimOffset, pivotValue.offset + dataDimOffset + dataDimsLength);
if (cmp == 0) {
cmp = reader.points[i].doc - reader.points[pivot].doc;
}
}
if (i < pivot) {
assertTrue(cmp <= 0);
@ -134,14 +132,54 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
}
private static Point[] createRandomPoints(int numDims, int bytesPerDim, int maxDoc) {
final int packedBytesLength = numDims * bytesPerDim;
private static Point[] createRandomPoints(int numDataDims, int numIndexdims, int bytesPerDim, int maxDoc, int[] commonPrefixLengths) {
assertTrue(commonPrefixLengths.length == numDataDims);
final int packedBytesLength = numDataDims * bytesPerDim;
final int numPoints = TestUtil.nextInt(random(), 1, 100000);
Point[] points = new Point[numPoints];
for (int i = 0; i < numPoints; ++i) {
byte[] value = new byte[packedBytesLength];
random().nextBytes(value);
points[i] = new Point(value, random().nextInt(maxDoc));
if (random().nextInt(5) != 0) {
for (int i = 0; i < numPoints; ++i) {
byte[] value = new byte[packedBytesLength];
random().nextBytes(value);
points[i] = new Point(value, random().nextInt(maxDoc));
}
for (int i = 0; i < numDataDims; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = 0; dim < numDataDims; ++dim) {
int offset = dim * bytesPerDim;
BytesRef packedValue = points[i].packedValue;
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
} else {
//index dim are equal, data dims different
byte[] indexDims = new byte[numIndexdims * bytesPerDim];
random().nextBytes(indexDims);
byte[] dataDims = new byte[(numDataDims - numIndexdims) * bytesPerDim];
for (int i = 0; i < numPoints; ++i) {
byte[] value = new byte[packedBytesLength];
System.arraycopy(indexDims, 0, value, 0, numIndexdims * bytesPerDim);
random().nextBytes(dataDims);
System.arraycopy(dataDims, 0, value, numIndexdims * bytesPerDim, (numDataDims - numIndexdims) * bytesPerDim);
points[i] = new Point(value, random().nextInt(maxDoc));
}
for (int i = 0; i < numIndexdims; ++i) {
commonPrefixLengths[i] = bytesPerDim;
}
for (int i = numDataDims; i < numDataDims; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = numIndexdims; dim < numDataDims; ++dim) {
int offset = dim * bytesPerDim;
BytesRef packedValue = points[i].packedValue;
System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
}
return points;
}