From 5bf6cf2eddf60a0d2696f31b9a252eb7af6f9c32 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 4 Jul 2019 10:50:23 +0200 Subject: [PATCH] LUCENE-8888: Improve distribution of points with data dimensions in BKD tree leaves (#747) --- lucene/CHANGES.txt | 3 + .../simpletext/SimpleTextBKDWriter.java | 31 ++-- .../lucene/util/bkd/BKDRadixSelector.java | 140 ++++++++++-------- .../org/apache/lucene/util/bkd/BKDWriter.java | 33 +++-- .../lucene/util/bkd/HeapPointReader.java | 39 +++-- .../lucene/util/bkd/HeapPointWriter.java | 50 +++---- .../util/bkd/MutablePointsReaderUtils.java | 53 +++++-- .../lucene/util/bkd/OfflinePointReader.java | 22 +-- .../lucene/util/bkd/OfflinePointWriter.java | 9 +- .../apache/lucene/util/bkd/PointValue.java | 7 +- .../org/apache/lucene/util/bkd/TestBKD.java | 74 +++++++++ .../lucene/util/bkd/TestBKDRadixSelector.java | 68 ++++++++- .../lucene/util/bkd/TestBKDRadixSort.java | 7 +- .../bkd/TestMutablePointsReaderUtils.java | 112 +++++++++----- 14 files changed, 443 insertions(+), 205 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1ed979cc336..a56bc832787 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -175,6 +175,9 @@ Optimizations * LUCENE-8901: Load frequencies lazily only when needed in BlockDocsEnum and BlockImpactsEverythingEnum (Mayya Sharipova). +* LUCENE-8888: Optimize distribution of points with data dimensions in + BKD tree leaves. 
(Ignacio Vera) + Test Framework * LUCENE-8825: CheckHits now display the shard index in case of mismatch diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java index aefa00c444c..ec41d874c94 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java @@ -596,7 +596,7 @@ final class SimpleTextBKDWriter implements Closeable { assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; //We re-use the selector so we do not need to create an object every time. - BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); + BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); boolean success = false; try { @@ -604,7 +604,7 @@ final class SimpleTextBKDWriter implements Closeable { build(1, numLeaves, points, out, radixSelector, minPackedValue, maxPackedValue, - splitPackedValues, leafBlockFPs); + splitPackedValues, leafBlockFPs, new int[maxPointsInLeafNode]); // If no exception, we should have cleaned everything up: @@ -876,7 +876,7 @@ final class SimpleTextBKDWriter implements Closeable { } // sort by sortedDim - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, from, to, scratchBytesRef1, scratchBytesRef2); // Save the block file pointer: @@ -919,7 +919,7 @@ final class SimpleTextBKDWriter implements Closeable { break; } } - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + 
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen, reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); // set the split value @@ -950,7 +950,8 @@ final class SimpleTextBKDWriter implements Closeable { BKDRadixSelector radixSelector, byte[] minPackedValue, byte[] maxPackedValue, byte[] splitPackedValues, - long[] leafBlockFPs) throws IOException { + long[] leafBlockFPs, + int[] spareDocIds) throws IOException { if (nodeID >= leafNodeOffset) { @@ -1009,7 +1010,12 @@ final class SimpleTextBKDWriter implements Closeable { // loading the values: int count = to - from; assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; - writeLeafBlockDocs(out, heapSource.docIDs, from, count); + // Write doc IDs + int[] docIDs = spareDocIds; + for (int i = 0; i < count; i++) { + docIDs[i] = heapSource.getPackedValueSlice(from + i).docID(); + } + writeLeafBlockDocs(out, spareDocIds, 0, count); // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us // from the index, much like how terms dict does so from the FST: @@ -1029,7 +1035,7 @@ final class SimpleTextBKDWriter implements Closeable { } }; assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, - heapSource.docIDs, from); + docIDs, 0); writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); } else { @@ -1074,12 +1080,12 @@ final class SimpleTextBKDWriter implements Closeable { // Recurse on left tree: build(2*nodeID, leafNodeOffset, pathSlices[0], out, radixSelector, - minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs); + minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); // TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right? 
// Recurse on right tree: build(2*nodeID+1, leafNodeOffset, pathSlices[1], out, radixSelector, - minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs); + minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); } } @@ -1131,6 +1137,13 @@ final class SimpleTextBKDWriter implements Closeable { if (cmp > 0) { throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord + " sortedDim=" + sortedDim); } + if (cmp == 0 && numDataDims > numIndexDims) { + int dataOffset = numIndexDims * bytesPerDim; + cmp = Arrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength); + if (cmp > 0) { + throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + } if (cmp == 0 && doc < lastDoc) { throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord + " sortedDim=" + sortedDim); } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java index 84fce3fa8e7..5a271103dbb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java @@ -47,6 +47,8 @@ public final class BKDRadixSelector { private final int bytesSorted; //data dimensions size private final int packedBytesLength; + // data dimensions plus docID size + private final int packedBytesDocIDLength; //flag to when we are moving to sort on heap private final int maxPointsSortInHeap; //reusable buffer @@ -59,18 +61,26 @@ public final class BKDRadixSelector { private final Directory 
tempDir; // prefix for temp files private final String tempFileNamePrefix; + // data and index dimensions + private final int numDataDims, numIndexDims; /** * Sole constructor. */ - public BKDRadixSelector(int numDim, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) { + public BKDRadixSelector(int numDataDims, int numIndexDims, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) { this.bytesPerDim = bytesPerDim; - this.packedBytesLength = numDim * bytesPerDim; - this.bytesSorted = bytesPerDim + Integer.BYTES; + this.numDataDims = numDataDims; + this.numIndexDims = numIndexDims; + this.packedBytesLength = numDataDims * bytesPerDim; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; + // Selection and sorting is done in a given dimension. In case the value of the dimension are equal + // between two points we tie break first using the data-only dimensions and if those are still equal + // we tie-break on the docID. Here we account for all bytes used in the process. 
+ this.bytesSorted = bytesPerDim + (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; this.maxPointsSortInHeap = maxPointsSortInHeap; - int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / (packedBytesLength + Integer.BYTES); - this.offlineBuffer = new byte[numberOfPointsOffline * (packedBytesLength + Integer.BYTES)]; + int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / packedBytesDocIDLength; + this.offlineBuffer = new byte[numberOfPointsOffline * packedBytesDocIDLength]; this.partitionBucket = new int[bytesSorted]; this.histogram = new long[HISTOGRAM_SIZE]; this.scratch = new byte[bytesSorted]; @@ -133,12 +143,12 @@ public final class BKDRadixSelector { assert commonPrefixPosition > dimCommonPrefix; reader.next(); PointValue pointValue = reader.pointValue(); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); // copy dimension - BytesRef packedValue = pointValue.packedValue(); - System.arraycopy(packedValue.bytes, packedValue.offset + offset, scratch, 0, bytesPerDim); - // copy docID - BytesRef docIDBytes = pointValue.docIDBytes(); - System.arraycopy(docIDBytes.bytes, docIDBytes.offset, scratch, bytesPerDim, Integer.BYTES); + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + offset, scratch, 0, bytesPerDim); + // copy data dimensions and docID + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + numIndexDims * bytesPerDim, scratch, bytesPerDim, (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES); + for (long i = from + 1; i < to; i++) { reader.next(); pointValue = reader.pointValue(); @@ -156,13 +166,15 @@ public final class BKDRadixSelector { //check common prefix and adjust histogram final int startIndex = (dimCommonPrefix > bytesPerDim) ? bytesPerDim : dimCommonPrefix; final int endIndex = (commonPrefixPosition > bytesPerDim) ? 
bytesPerDim : commonPrefixPosition; - packedValue = pointValue.packedValue(); - int j = Arrays.mismatch(scratch, startIndex, endIndex, packedValue.bytes, packedValue.offset + offset + startIndex, packedValue.offset + offset + endIndex); + packedValueDocID = pointValue.packedValueDocIDBytes(); + int j = Arrays.mismatch(scratch, startIndex, endIndex, packedValueDocID.bytes, packedValueDocID.offset + offset + startIndex, packedValueDocID.offset + offset + endIndex); if (j == -1) { if (commonPrefixPosition > bytesPerDim) { - //tie-break on docID - docIDBytes = pointValue.docIDBytes(); - int k = Arrays.mismatch(scratch, bytesPerDim, commonPrefixPosition, docIDBytes.bytes, docIDBytes.offset, docIDBytes.offset + commonPrefixPosition - bytesPerDim); + //tie-break on data dimensions + docID + final int startTieBreak = numIndexDims * bytesPerDim; + final int endTieBreak = startTieBreak + commonPrefixPosition - bytesPerDim; + int k = Arrays.mismatch(scratch, bytesPerDim, commonPrefixPosition, + packedValueDocID.bytes, packedValueDocID.offset + startTieBreak, packedValueDocID.offset + endTieBreak); if (k != -1) { commonPrefixPosition = bytesPerDim + k; Arrays.fill(histogram, 0); @@ -194,8 +206,8 @@ public final class BKDRadixSelector { BytesRef packedValue = pointValue.packedValue(); bucket = packedValue.bytes[packedValue.offset + offset + commonPrefixPosition] & 0xff; } else { - BytesRef docIDValue = pointValue.docIDBytes(); - bucket = docIDValue.bytes[docIDValue.offset + commonPrefixPosition - bytesPerDim] & 0xff; + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + bucket = packedValueDocID.bytes[packedValueDocID.offset + numIndexDims * bytesPerDim + commonPrefixPosition - bytesPerDim] & 0xff; } return bucket; } @@ -309,10 +321,11 @@ public final class BKDRadixSelector { return partition; } - private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefix) { - final int offset = dim * bytesPerDim + 
commonPrefix; - final int dimCmpBytes = bytesPerDim - commonPrefix; - new RadixSelector(bytesSorted - commonPrefix) { + private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefixLength) { + final int dimOffset = dim * bytesPerDim + commonPrefixLength; + final int dimCmpBytes = bytesPerDim - commonPrefixLength; + final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes; + new RadixSelector(bytesSorted - commonPrefixLength) { @Override protected void swap(int i, int j) { @@ -324,23 +337,23 @@ public final class BKDRadixSelector { assert k >= 0 : "negative prefix " + k; if (k < dimCmpBytes) { // dim bytes - return points.block[i * packedBytesLength + offset + k] & 0xff; + return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff; } else { - // doc id - int s = 3 - (k - dimCmpBytes); - return (points.docIDs[i] >>> (s * 8)) & 0xff; + // data bytes + return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff; } } @Override protected Selector getFallbackSelector(int d) { - int skypedBytes = d + commonPrefix; - final int start = dim * bytesPerDim + skypedBytes; - final int end = dim * bytesPerDim + bytesPerDim; + final int skypedBytes = d + commonPrefixLength; + final int dimStart = dim * bytesPerDim + skypedBytes; + final int dimEnd = dim * bytesPerDim + bytesPerDim; + final int dataOffset = numIndexDims * bytesPerDim; + // data length is composed by the data dimensions plus the docID + final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; return new IntroSelector() { - int pivotDoc = -1; - @Override protected void swap(int i, int j) { points.swap(i, j); @@ -349,36 +362,37 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < bytesPerDim) { - System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim); + System.arraycopy(points.block, i * packedBytesDocIDLength + dim 
* bytesPerDim, scratch, 0, bytesPerDim); } - pivotDoc = points.docIDs[i]; + System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength); } @Override protected int compare(int i, int j) { if (skypedBytes < bytesPerDim) { - int iOffset = i * packedBytesLength; - int jOffset = j * packedBytesLength; - int cmp = Arrays.compareUnsigned(points.block, iOffset + start, iOffset + end, - points.block, jOffset + start, jOffset + end); + int iOffset = i * packedBytesDocIDLength; + int jOffset = j * packedBytesDocIDLength; + int cmp = Arrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return points.docIDs[i] - points.docIDs[j]; + int iOffset = i * packedBytesDocIDLength + dataOffset; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return Arrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength); } @Override protected int comparePivot(int j) { if (skypedBytes < bytesPerDim) { - int jOffset = j * packedBytesLength; - int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, - points.block, jOffset + start, jOffset + end); + int jOffset = j * packedBytesDocIDLength; + int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return pivotDoc - points.docIDs[j]; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return Arrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength); } }; } @@ -393,8 +407,9 @@ public final class BKDRadixSelector { /** Sort the heap writer by the specified dim. 
It is used to sort the leaves of the tree */ public void heapRadixSort(final HeapPointWriter points, int from, int to, int dim, int commonPrefixLength) { - final int offset = dim * bytesPerDim + commonPrefixLength; + final int dimOffset = dim * bytesPerDim + commonPrefixLength; final int dimCmpBytes = bytesPerDim - commonPrefixLength; + final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes; new MSBRadixSorter(bytesSorted - commonPrefixLength) { @Override @@ -402,11 +417,10 @@ public final class BKDRadixSelector { assert k >= 0 : "negative prefix " + k; if (k < dimCmpBytes) { // dim bytes - return points.block[i * packedBytesLength + offset + k] & 0xff; + return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff; } else { - // doc id - int s = 3 - (k - dimCmpBytes); - return (points.docIDs[i] >>> (s * 8)) & 0xff; + // data bytes + return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff; } } @@ -417,13 +431,14 @@ public final class BKDRadixSelector { @Override protected Sorter getFallbackSorter(int k) { - int skypedBytes = k + commonPrefixLength; - final int start = dim * bytesPerDim + skypedBytes; - final int end = dim * bytesPerDim + bytesPerDim; + final int skypedBytes = k + commonPrefixLength; + final int dimStart = dim * bytesPerDim + skypedBytes; + final int dimEnd = dim * bytesPerDim + bytesPerDim; + final int dataOffset = numIndexDims * bytesPerDim; + // data length is composed by the data dimensions plus the docID + final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; return new IntroSorter() { - int pivotDoc = -1; - @Override protected void swap(int i, int j) { points.swap(i, j); @@ -432,36 +447,37 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < bytesPerDim) { - System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim); + System.arraycopy(points.block, i * packedBytesDocIDLength + dim * 
bytesPerDim, scratch, 0, bytesPerDim); } - pivotDoc = points.docIDs[i]; + System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength); } @Override protected int compare(int i, int j) { if (skypedBytes < bytesPerDim) { - int iOffset = i * packedBytesLength; - int jOffset = j * packedBytesLength; - int cmp = Arrays.compareUnsigned(points.block, iOffset + start, iOffset + end, - points.block, jOffset + start, jOffset + end); + int iOffset = i * packedBytesDocIDLength; + int jOffset = j * packedBytesDocIDLength; + int cmp = Arrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return points.docIDs[i] - points.docIDs[j]; + int iOffset = i * packedBytesDocIDLength + dataOffset; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return Arrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength); } @Override protected int comparePivot(int j) { if (skypedBytes < bytesPerDim) { - int jOffset = j * packedBytesLength; - int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, - points.block, jOffset + start, jOffset + end); + int jOffset = j * packedBytesDocIDLength; + int cmp = Arrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return pivotDoc - points.docIDs[j]; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return Arrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index ae7b362fcbd..650d4af82e0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java 
@@ -774,7 +774,7 @@ public class BKDWriter implements Closeable { assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; //We re-use the selector so we do not need to create an object every time. - BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); + BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); boolean success = false; try { @@ -785,7 +785,8 @@ public class BKDWriter implements Closeable { minPackedValue, maxPackedValue, parentSplits, splitPackedValues, - leafBlockFPs); + leafBlockFPs, + new int[maxPointsInLeafNode]); assert Arrays.equals(parentSplits, new int[numIndexDims]); // If no exception, we should have cleaned everything up: @@ -1361,7 +1362,7 @@ public class BKDWriter implements Closeable { } // sort by sortedDim - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, from, to, scratchBytesRef1, scratchBytesRef2); BytesRef comparator = scratchBytesRef1; @@ -1428,7 +1429,7 @@ public class BKDWriter implements Closeable { commonPrefixLen = bytesPerDim; } - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen, reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); // set the split value @@ -1465,7 +1466,8 @@ public class BKDWriter implements Closeable { byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits, byte[] splitPackedValues, - long[] leafBlockFPs) throws IOException { + long[] leafBlockFPs, + int[] spareDocIds) throws IOException { if (nodeID >= leafNodeOffset) { @@ -1525,7 
+1527,13 @@ public class BKDWriter implements Closeable { // loading the values: int count = to - from; assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; - writeLeafBlockDocs(out, heapSource.docIDs, from, count); + assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length; + // Write doc IDs + int[] docIDs = spareDocIds; + for (int i = 0; i < count; i++) { + docIDs[i] = heapSource.getPackedValueSlice(from + i).docID(); + } + writeLeafBlockDocs(out, docIDs, 0, count); // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us // from the index, much like how terms dict does so from the FST: @@ -1548,7 +1556,7 @@ public class BKDWriter implements Closeable { } }; assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, - heapSource.docIDs, from); + docIDs, 0); writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality); } else { @@ -1595,12 +1603,12 @@ public class BKDWriter implements Closeable { // Recurse on left tree: build(2 * nodeID, leafNodeOffset, slices[0], out, radixSelector, minPackedValue, maxSplitPackedValue, - parentSplits, splitPackedValues, leafBlockFPs); + parentSplits, splitPackedValues, leafBlockFPs, spareDocIds); // Recurse on right tree: build(2 * nodeID + 1, leafNodeOffset, slices[1], out, radixSelector, minSplitPackedValue, maxPackedValue - , parentSplits, splitPackedValues, leafBlockFPs); + , parentSplits, splitPackedValues, leafBlockFPs, spareDocIds); parentSplits[splitDim]--; } @@ -1654,6 +1662,13 @@ public class BKDWriter implements Closeable { if (cmp > 0) { throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); } + if (cmp == 0 && numDataDims > numIndexDims) { + int dataOffset = numIndexDims 
* bytesPerDim; + cmp = Arrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength); + if (cmp > 0) { + throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + } if (cmp == 0 && doc < lastDoc) { throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord); } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java index 735b3138b21..4cee1ef036d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java @@ -27,16 +27,16 @@ public final class HeapPointReader implements PointReader { private int curRead; final byte[] block; final int packedBytesLength; - final int[] docIDs; + final int packedBytesDocIDLength; final int end; private final HeapPointValue pointValue; - public HeapPointReader(byte[] block, int packedBytesLength, int[] docIDs, int start, int end) { + public HeapPointReader(byte[] block, int packedBytesLength, int start, int end) { this.block = block; - this.docIDs = docIDs; curRead = start-1; this.end = end; this.packedBytesLength = packedBytesLength; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; if (start < end) { this.pointValue = new HeapPointValue(block, packedBytesLength); } else { @@ -53,7 +53,7 @@ public final class HeapPointReader implements PointReader { @Override public PointValue pointValue() { - pointValue.setValue(curRead * packedBytesLength, docIDs[curRead]); + pointValue.setOffset(curRead * packedBytesDocIDLength); return pointValue; } @@ -66,21 +66,22 @@ public final class HeapPointReader implements PointReader { */ static class HeapPointValue 
implements PointValue { - BytesRef packedValue; - BytesRef docIDBytes; - int docID; + final BytesRef packedValue; + final BytesRef packedValueDocID; + final int packedValueLength; - public HeapPointValue(byte[] value, int packedLength) { - packedValue = new BytesRef(value, 0, packedLength); - docIDBytes = new BytesRef(new byte[4]); + HeapPointValue(byte[] value, int packedValueLength) { + this.packedValueLength = packedValueLength; + this.packedValue = new BytesRef(value, 0, packedValueLength); + this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES); } /** - * Sets a new value by changing the offset and docID. + * Sets a new value by changing the offset. */ - public void setValue(int offset, int docID) { - this.docID = docID; + public void setOffset(int offset) { packedValue.offset = offset; + packedValueDocID.offset = offset; } @Override @@ -90,16 +91,14 @@ public final class HeapPointReader implements PointReader { @Override public int docID() { - return docID; + int position = packedValueDocID.offset + packedValueLength; + return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16) + | ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF); } @Override - public BytesRef docIDBytes() { - docIDBytes.bytes[0] = (byte) (docID >> 24); - docIDBytes.bytes[1] = (byte) (docID >> 16); - docIDBytes.bytes[2] = (byte) (docID >> 8); - docIDBytes.bytes[3] = (byte) (docID >> 0); - return docIDBytes; + public BytesRef packedValueDocIDBytes() { + return packedValueDocID; } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java index 306ec5c46ee..695d7e1f2bd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java @@ -26,10 +26,10 @@ import 
org.apache.lucene.util.BytesRef; * @lucene.internal * */ public final class HeapPointWriter implements PointWriter { - public final int[] docIDs; public final byte[] block; final int size; final int packedBytesLength; + final int packedBytesDocIDLength; private final byte[] scratch; private int nextWrite; private boolean closed; @@ -38,11 +38,11 @@ public final class HeapPointWriter implements PointWriter { public HeapPointWriter(int size, int packedBytesLength) { - this.docIDs = new int[size]; - this.block = new byte[packedBytesLength * size]; - this.size = size; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; this.packedBytesLength = packedBytesLength; - this.scratch = new byte[packedBytesLength]; + this.block = new byte[packedBytesDocIDLength * size]; + this.size = size; + this.scratch = new byte[packedBytesDocIDLength]; if (size > 0) { pointValue = new HeapPointReader.HeapPointValue(block, packedBytesLength); } else { @@ -54,7 +54,7 @@ public final class HeapPointWriter implements PointWriter { /** Returns a reference, in result, to the byte[] slice holding this value */ public PointValue getPackedValueSlice(int index) { assert index < nextWrite : "nextWrite=" + (nextWrite) + " vs index=" + index; - pointValue.setValue(index * packedBytesLength, docIDs[index]); + pointValue.setOffset(index * packedBytesDocIDLength); return pointValue; } @@ -63,8 +63,12 @@ public final class HeapPointWriter implements PointWriter { assert closed == false : "point writer is already closed"; assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - System.arraycopy(packedValue, 0, block, nextWrite * packedBytesLength, packedBytesLength); - docIDs[nextWrite] = docID; + System.arraycopy(packedValue, 0, block, nextWrite * packedBytesDocIDLength, packedBytesLength); + int position = nextWrite * 
packedBytesDocIDLength + packedBytesLength; + block[position] = (byte) (docID >> 24); + block[++position] = (byte) (docID >> 16); + block[++position] = (byte) (docID >> 8); + block[++position] = (byte) (docID >> 0); nextWrite++; } @@ -72,27 +76,23 @@ public final class HeapPointWriter implements PointWriter { public void append(PointValue pointValue) { assert closed == false : "point writer is already closed"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - BytesRef packedValue = pointValue.packedValue(); - assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + (packedBytesLength) + "] but was [" + packedValue.length + "]"; - System.arraycopy(packedValue.bytes, packedValue.offset, block, nextWrite * packedBytesLength, packedBytesLength); - docIDs[nextWrite] = pointValue.docID(); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + assert packedValueDocID.length == packedBytesDocIDLength : "[packedValue] must have length [" + (packedBytesDocIDLength) + "] but was [" + packedValueDocID.length + "]"; + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset, block, nextWrite * packedBytesDocIDLength, packedBytesDocIDLength); nextWrite++; } public void swap(int i, int j) { - int docID = docIDs[i]; - docIDs[i] = docIDs[j]; - docIDs[j] = docID; - int indexI = i * packedBytesLength; - int indexJ = j * packedBytesLength; + int indexI = i * packedBytesDocIDLength; + int indexJ = j * packedBytesDocIDLength; // scratch1 = values[i] - System.arraycopy(block, indexI, scratch, 0, packedBytesLength); + System.arraycopy(block, indexI, scratch, 0, packedBytesDocIDLength); // values[i] = values[j] - System.arraycopy(block, indexJ, block, indexI, packedBytesLength); + System.arraycopy(block, indexJ, block, indexI, packedBytesDocIDLength); // values[j] = scratch1 - System.arraycopy(scratch, 0, block, indexJ, packedBytesLength); + System.arraycopy(scratch, 0, block, indexJ, packedBytesDocIDLength); 
} public int computeCardinality(int from, int to, int numDataDims, int bytesPerDim, int[] commonPrefixLengths) { @@ -102,8 +102,8 @@ public final class HeapPointWriter implements PointWriter { for (int dim = 0; dim < numDataDims; dim++) { final int start = dim * bytesPerDim + commonPrefixLengths[dim]; final int end = dim * bytesPerDim + bytesPerDim; - if (Arrays.mismatch(block, i * packedBytesLength + start, i * packedBytesLength + end, - block, (i - 1) * packedBytesLength + start, (i - 1) * packedBytesLength + end) != -1) { + if (Arrays.mismatch(block, i * packedBytesDocIDLength + start, i * packedBytesDocIDLength + end, + block, (i - 1) * packedBytesDocIDLength + start, (i - 1) * packedBytesDocIDLength + end) != -1) { leafCardinality++; break; } @@ -120,9 +120,9 @@ public final class HeapPointWriter implements PointWriter { @Override public PointReader getReader(long start, long length) { assert closed : "point writer is still open and trying to get a reader"; - assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length; + assert start + length <= size: "start=" + start + " length=" + length + " docIDs.length=" + size; assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite; - return new HeapPointReader(block, packedBytesLength, docIDs, (int) start, Math.toIntExact(start+length)); + return new HeapPointReader(block, packedBytesLength, (int) start, Math.toIntExact(start+length)); } @Override @@ -136,6 +136,6 @@ public final class HeapPointWriter implements PointWriter { @Override public String toString() { - return "HeapPointWriter(count=" + nextWrite + " size=" + docIDs.length + ")"; + return "HeapPointWriter(count=" + nextWrite + " size=" + size + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java index 5323476267d..15288590cce 100644 
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java @@ -78,7 +78,8 @@ public final class MutablePointsReaderUtils { protected int comparePivot(int j) { if (k < packedBytesLength) { reader.getValue(j, scratch); - int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k, scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k); + int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k, + scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k); if (cmp != 0) { return cmp; } @@ -92,14 +93,16 @@ public final class MutablePointsReaderUtils { } /** Sort points on the given dimension. */ - public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths, + public static void sortByDim(int numDataDim, int numIndexDim, int sortedDim, int bytesPerDim, int[] commonPrefixLengths, MutablePointValues reader, int from, int to, BytesRef scratch1, BytesRef scratch2) { + final int start = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim]; + final int dimEnd = sortedDim * bytesPerDim + bytesPerDim; + final int dataStart = numIndexDim * bytesPerDim; + final int dataEnd = dataStart + (numDataDim - numIndexDim) * bytesPerDim; // No need for a fancy radix sort here, this is called on the leaves only so // there are not many values to sort - final int offset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim]; - final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim]; new IntroSorter() { final BytesRef pivot = scratch1; @@ -119,9 +122,14 @@ public final class MutablePointsReaderUtils { @Override protected int comparePivot(int j) { reader.getValue(j, scratch2); - int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + offset, pivot.offset + offset + numBytesToCompare, 
scratch2.bytes, scratch2.offset + offset, scratch2.offset + offset + numBytesToCompare); + int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + start, pivot.offset + dimEnd, scratch2.bytes, + scratch2.offset + start, scratch2.offset + dimEnd); if (cmp == 0) { - cmp = pivotDoc - reader.getDocID(j); + cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd, + scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd); + if (cmp == 0) { + cmp = pivotDoc - reader.getDocID(j); + } } return cmp; } @@ -131,16 +139,20 @@ public final class MutablePointsReaderUtils { /** Partition points around {@code mid}. All values on the left must be less * than or equal to it and all values on the right must be greater than or * equal to it. */ - public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen, + public static void partition(int numDataDim, int numIndexDim, int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen, MutablePointValues reader, int from, int to, int mid, BytesRef scratch1, BytesRef scratch2) { - final int offset = splitDim * bytesPerDim + commonPrefixLen; - final int cmpBytes = bytesPerDim - commonPrefixLen; + final int dimOffset = splitDim * bytesPerDim + commonPrefixLen; + final int dimCmpBytes = bytesPerDim - commonPrefixLen; + final int dataOffset = numIndexDim * bytesPerDim; + final int dataCmpBytes = (numDataDim - numIndexDim) * bytesPerDim + dimCmpBytes; final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1); - new RadixSelector(cmpBytes + (bitsPerDocId + 7) / 8) { + new RadixSelector(dataCmpBytes + (bitsPerDocId + 7) / 8) { @Override protected Selector getFallbackSelector(int k) { + final int dataStart = (k < dimCmpBytes) ? 
dataOffset : dataOffset + k - dimCmpBytes; + final int dataEnd = numDataDim * bytesPerDim; return new IntroSelector() { final BytesRef pivot = scratch1; @@ -159,9 +171,18 @@ public final class MutablePointsReaderUtils { @Override protected int comparePivot(int j) { - if (k < cmpBytes) { + if (k < dimCmpBytes) { reader.getValue(j, scratch2); - int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + offset + k, pivot.offset + offset + k + cmpBytes - k, scratch2.bytes, scratch2.offset + offset + k, scratch2.offset + offset + k + cmpBytes - k); + int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dimOffset + k, pivot.offset + dimOffset + dimCmpBytes, + scratch2.bytes, scratch2.offset + dimOffset + k, scratch2.offset + dimOffset + dimCmpBytes); + if (cmp != 0) { + return cmp; + } + } + if (k < dataCmpBytes) { + reader.getValue(j, scratch2); + int cmp = Arrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd, + scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd); if (cmp != 0) { return cmp; } @@ -178,10 +199,12 @@ public final class MutablePointsReaderUtils { @Override protected int byteAt(int i, int k) { - if (k < cmpBytes) { - return Byte.toUnsignedInt(reader.getByteAt(i, offset + k)); + if (k < dimCmpBytes) { + return Byte.toUnsignedInt(reader.getByteAt(i, dimOffset + k)); + } else if (k < dataCmpBytes) { + return Byte.toUnsignedInt(reader.getByteAt(i, dataOffset + k - dimCmpBytes)); } else { - final int shift = bitsPerDocId - ((k - cmpBytes + 1) << 3); + final int shift = bitsPerDocId - ((k - dataCmpBytes + 1) << 3); return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java index 6218429b071..030bd719fea 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java @@ -137,12 +137,14 @@ public final class OfflinePointReader implements PointReader { */ static class OfflinePointValue implements PointValue { - BytesRef packedValue; - BytesRef docIDBytes; + final BytesRef packedValue; + final BytesRef packedValueDocID; + final int packedValueLength; OfflinePointValue(byte[] value, int packedValueLength) { - packedValue = new BytesRef(value, 0, packedValueLength); - docIDBytes = new BytesRef(value, packedValueLength, Integer.BYTES); + this.packedValueLength = packedValueLength; + this.packedValue = new BytesRef(value, 0, packedValueLength); + this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES); } /** @@ -150,7 +152,7 @@ public final class OfflinePointReader implements PointReader { */ public void setOffset(int offset) { packedValue.offset = offset; - docIDBytes.offset = offset + packedValue.length; + packedValueDocID.offset = offset; } @Override @@ -160,14 +162,14 @@ public final class OfflinePointReader implements PointReader { @Override public int docID() { - int position =docIDBytes.offset; - return ((docIDBytes.bytes[position] & 0xFF) << 24) | ((docIDBytes.bytes[++position] & 0xFF) << 16) - | ((docIDBytes.bytes[++position] & 0xFF) << 8) | (docIDBytes.bytes[++position] & 0xFF); + int position = packedValueDocID.offset + packedValueLength; + return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16) + | ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF); } @Override - public BytesRef docIDBytes() { - return docIDBytes; + public BytesRef packedValueDocIDBytes() { + return packedValueDocID; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java index 4f4ce9ee740..d9fa86f5307 100644 --- 
a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java @@ -62,12 +62,9 @@ public final class OfflinePointWriter implements PointWriter { @Override public void append(PointValue pointValue) throws IOException { assert closed == false : "Point writer is already closed"; - BytesRef packedValue = pointValue.packedValue(); - assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]"; - out.writeBytes(packedValue.bytes, packedValue.offset, packedValue.length); - BytesRef docIDBytes = pointValue.docIDBytes(); - assert docIDBytes.length == Integer.BYTES : "[docIDBytes] must have length [" + Integer.BYTES + "] but was [" + docIDBytes.length + "]"; - out.writeBytes(docIDBytes.bytes, docIDBytes.offset, docIDBytes.length); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + assert packedValueDocID.length == packedBytesLength + Integer.BYTES : "[packedValue and docID] must have length [" + (packedBytesLength + Integer.BYTES) + "] but was [" + packedValueDocID.length + "]"; + out.writeBytes(packedValueDocID.bytes, packedValueDocID.offset, packedValueDocID.length); count++; assert expectedCount == 0 || count <= expectedCount : "expectedCount=" + expectedCount + " vs count=" + count; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java b/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java index 28506278782..d933e81ff61 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java @@ -28,10 +28,11 @@ public interface PointValue { /** Returns the packed values for the dimensions */ BytesRef packedValue(); - /** Returns the document id */ + /** Returns the docID */ int docID(); - /** Returns the byte representation of the document id */ - BytesRef docIDBytes(); + 
/** Returns the byte representation of the packed value + * together with the docID */ + BytesRef packedValueDocIDBytes(); } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index 338e459ad3d..fd80be663bd 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -1097,6 +1097,80 @@ public class TestBKD extends LuceneTestCase { } } + public void testCheckDataDimOptimalOrder() throws IOException { + Directory dir = newDirectory(); + final int numValues = atLeast(5000); + final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); + final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4); + final double maxMB = (float) 3.0 + (3*random().nextDouble()); + + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); + + final byte[] pointValue1 = new byte[numDataDims * numBytesPerDim]; + final byte[] pointValue2 = new byte[numDataDims * numBytesPerDim]; + random().nextBytes(pointValue1); + random().nextBytes(pointValue2); + // equal index dimensions but different data dimensions + for (int i = 0; i < numIndexDims; i++) { + System.arraycopy(pointValue1, i * numBytesPerDim, pointValue2, i * numBytesPerDim, numBytesPerDim); + } + + BKDWriter w = new BKDWriter(2 * numValues, dir, "_temp", numDataDims, numIndexDims, numBytesPerDim, maxPointsInLeafNode, + maxMB, 2 * numValues); + for (int i = 0; i < numValues; ++i) { + w.add(pointValue1, i); + w.add(pointValue2, i); + } + final long indexFP; + try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { + indexFP = w.finish(out); + w.close(); + } + + IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT); + pointsIn.seek(indexFP); + BKDReader points = new BKDReader(pointsIn); + + points.intersect(new IntersectVisitor() { + + byte[] previous = null; + boolean 
hasChanged = false; + + @Override + public void visit(int docID) { + throw new UnsupportedOperationException(); + } + + @Override + public void visit(int docID, byte[] packedValue) { + if (previous == null) { + previous = new byte[numDataDims * numBytesPerDim]; + System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim); + } else { + int mismatch = Arrays.mismatch(packedValue, previous); + if (mismatch != -1) { + if (hasChanged == false) { + hasChanged = true; + System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim); + } else { + fail("Points are not in optimal order"); + } + } + } + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + + pointsIn.close(); + dir.close(); + } + public void test2DLongOrdsOffline() throws Exception { try (Directory dir = newDirectory()) { int numDocs = 100000; diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java index 075cd1cd9d8..898643d3ca1 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java @@ -208,7 +208,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { } private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{ - BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); //we only split by indexed dimension so we check for each only those dimension for (int splitDim = 0; splitDim < indexDimensions; splitDim++) { //We need to 
make a copy of the data as it is deleted in the process @@ -224,9 +224,15 @@ public class TestBKDRadixSelector extends LuceneTestCase { int cmp = Arrays.compareUnsigned(max, 0, bytesPerDimensions, min, 0, bytesPerDimensions); assertTrue(cmp <= 0); if (cmp == 0) { - int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint); - int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint); - assertTrue(minDocId >= maxDocID); + byte[] maxDataDim = getMaxDataDimension(slices[0], bytesPerDimensions, dataDimensions, indexDimensions, max, splitDim); + byte[] minDataDim = getMinDataDimension(slices[1], bytesPerDimensions, dataDimensions, indexDimensions, min, splitDim); + cmp = Arrays.compareUnsigned(maxDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions, minDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions); + assertTrue(cmp <= 0); + if (cmp == 0) { + int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,maxDataDim); + int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,minDataDim); + assertTrue(minDocId >= maxDocID); + } } assertTrue(Arrays.equals(partitionPoint, min)); slices[0].writer.destroy(); @@ -292,14 +298,17 @@ public class TestBKDRadixSelector extends LuceneTestCase { return min; } - private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException { + private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException { int docID = Integer.MAX_VALUE; try (PointReader reader = p.writer.getReader(p.start, p.count)) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); int offset = dimension * bytesPerDimension; - if 
(Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) { + int dataOffset = indexDims * bytesPerDimension; + int dataLength = (dataDims - indexDims) * bytesPerDimension; + if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0 + && Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) { int newDocID = pointValue.docID(); if (newDocID < docID) { docID = newDocID; @@ -310,6 +319,26 @@ public class TestBKDRadixSelector extends LuceneTestCase { return docID; } + private byte[] getMinDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] minDim, int splitDim) throws IOException { + byte[] min = new byte[(dataDims - indexDims) * bytesPerDimension]; + Arrays.fill(min, (byte) 0xff); + int offset = splitDim * bytesPerDimension; + try (PointReader reader = p.writer.getReader(p.start, p.count)) { + byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension]; + while (reader.next()) { + PointValue pointValue = reader.pointValue(); + BytesRef packedValue = pointValue.packedValue(); + if (Arrays.mismatch(minDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) { + System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension); + if (Arrays.compareUnsigned(min, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) > 0) { + System.arraycopy(value, 0, min, 0, (dataDims - indexDims) * bytesPerDimension); + } + } + } + } + return min; + } + private byte[] getMax(BKDRadixSelector.PathSlice p, int bytesPerDimension, 
int dimension) throws IOException { byte[] max = new byte[bytesPerDimension]; Arrays.fill(max, (byte) 0); @@ -327,14 +356,37 @@ public class TestBKDRadixSelector extends LuceneTestCase { return max; } - private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException { + private byte[] getMaxDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] maxDim, int splitDim) throws IOException { + byte[] max = new byte[(dataDims - indexDims) * bytesPerDimension]; + Arrays.fill(max, (byte) 0); + int offset = splitDim * bytesPerDimension; + try (PointReader reader = p.writer.getReader(p.start, p.count)) { + byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension]; + while (reader.next()) { + PointValue pointValue = reader.pointValue(); + BytesRef packedValue = pointValue.packedValue(); + if (Arrays.mismatch(maxDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) { + System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension); + if (Arrays.compareUnsigned(max, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) < 0) { + System.arraycopy(value, 0, max, 0, (dataDims - indexDims) * bytesPerDimension); + } + } + } + } + return max; + } + + private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException { int docID = Integer.MIN_VALUE; try (PointReader reader = p.writer.getReader(p.start, p.count)) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); int offset = dimension * bytesPerDimension; - if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, 
packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) { + int dataOffset = indexDims * bytesPerDimension; + int dataLength = (dataDims - indexDims) * bytesPerDimension; + if (Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0 + && Arrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) { int newDocID = pointValue.docID(); if (newDocID > docID) { docID = newDocID; diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java index a7cebd024af..1b0b8922002 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java @@ -116,7 +116,7 @@ public class TestBKDRadixSort extends LuceneTestCase { private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{ int packedBytesLength = dataDimensions * bytesPerDim; Directory dir = newDirectory(); - BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test"); + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDim, 1000, dir, "test"); // we check for each dimension for (int splitDim = 0; splitDim < dataDimensions; splitDim++) { radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim)); @@ -129,6 +129,11 @@ public class TestBKDRadixSort extends LuceneTestCase { BytesRef value = pointValue.packedValue(); int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim); 
assertTrue(cmp >= 0); + if (cmp == 0) { + int dataOffset = indexDimensions * bytesPerDim; + cmp = Arrays.compareUnsigned(value.bytes, value.offset + dataOffset, value.offset + packedBytesLength, previous, dataOffset, packedBytesLength); + assertTrue(cmp >= 0); + } if (cmp == 0) { assertTrue(pointValue.docID() >= previousDocId); } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java index 6f6bf0240cf..e17eab1f9a2 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java @@ -37,7 +37,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { private void doTestSort() { final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(1, bytesPerDim, maxDoc); + Point[] points = createRandomPoints(1, 1, bytesPerDim, maxDoc, new int[1]); DummyPointsReader reader = new DummyPointsReader(points); MutablePointsReaderUtils.sort(maxDoc, bytesPerDim, reader, 0, points.length); Arrays.sort(points, new Comparator() { @@ -61,25 +61,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { } private void doTestSortByDim() { - final int numDims = TestUtil.nextInt(random(), 1, 8); + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc); - int[] commonPrefixLengths = new int[numDims]; - for (int i = 0; i < commonPrefixLengths.length; ++i) { - commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); - } - BytesRef 
firstValue = points[0].packedValue; - for (int i = 1; i < points.length; ++i) { - for (int dim = 0; dim < numDims; ++dim) { - int offset = dim * bytesPerDim; - BytesRef packedValue = points[i].packedValue; - System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); - } - } + int[] commonPrefixLengths = new int[numDataDims]; + Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths); DummyPointsReader reader = new DummyPointsReader(points); - final int sortedDim = random().nextInt(numDims); - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length, + final int sortedDim = random().nextInt(numIndexDims); + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length, new BytesRef(), new BytesRef()); for (int i = 1; i < points.length; ++i) { final int offset = sortedDim * bytesPerDim; @@ -87,7 +77,13 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { BytesRef currentValue = reader.points[i].packedValue; int cmp = Arrays.compareUnsigned(previousValue.bytes, previousValue.offset + offset, previousValue.offset + offset + bytesPerDim, currentValue.bytes, currentValue.offset + offset, currentValue.offset + offset + bytesPerDim); if (cmp == 0) { - cmp = reader.points[i - 1].doc - reader.points[i].doc; + int dataDimOffset = numIndexDims * bytesPerDim; + int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim; + cmp = Arrays.compareUnsigned(previousValue.bytes, previousValue.offset + dataDimOffset, previousValue.offset + dataDimOffset + dataDimsLength, + currentValue.bytes, currentValue.offset + dataDimOffset, currentValue.offset + dataDimOffset + dataDimsLength); + if (cmp == 0) { + cmp = reader.points[i - 1].doc - reader.points[i].doc; + } } assertTrue(cmp <= 0); } @@ -100,29 +96,31 @@ public class 
TestMutablePointsReaderUtils extends LuceneTestCase { } private void doTestPartition() { - final int numDims = TestUtil.nextInt(random(), 1, 8); + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); + int[] commonPrefixLengths = new int[numDataDims]; final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc); - int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim); - final int splitDim = random().nextInt(numDims); - BytesRef firstValue = points[0].packedValue; - for (int i = 1; i < points.length; ++i) { - BytesRef packedValue = points[i].packedValue; - int offset = splitDim * bytesPerDim; - System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength); - } + Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths); + final int splitDim = random().nextInt(numIndexDims); DummyPointsReader reader = new DummyPointsReader(points); final int pivot = TestUtil.nextInt(random(), 0, points.length - 1); - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot, + MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLengths[splitDim], reader, 0, points.length, pivot, new BytesRef(), new BytesRef()); BytesRef pivotValue = reader.points[pivot].packedValue; int offset = splitDim * bytesPerDim; for (int i = 0; i < points.length; ++i) { BytesRef value = reader.points[i].packedValue; - int cmp = Arrays.compareUnsigned(value.bytes, value.offset + offset, value.offset + offset + bytesPerDim, pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim); + int cmp = Arrays.compareUnsigned(value.bytes, 
value.offset + offset, value.offset + offset + bytesPerDim, + pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim); if (cmp == 0) { - cmp = reader.points[i].doc - reader.points[pivot].doc; + int dataDimOffset = numIndexDims * bytesPerDim; + int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim; + cmp = Arrays.compareUnsigned(value.bytes, value.offset + dataDimOffset, value.offset + dataDimOffset + dataDimsLength, + pivotValue.bytes, pivotValue.offset + dataDimOffset, pivotValue.offset + dataDimOffset + dataDimsLength); + if (cmp == 0) { + cmp = reader.points[i].doc - reader.points[pivot].doc; + } } if (i < pivot) { assertTrue(cmp <= 0); @@ -134,14 +132,54 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { } } - private static Point[] createRandomPoints(int numDims, int bytesPerDim, int maxDoc) { - final int packedBytesLength = numDims * bytesPerDim; + private static Point[] createRandomPoints(int numDataDims, int numIndexdims, int bytesPerDim, int maxDoc, int[] commonPrefixLengths) { + assertTrue(commonPrefixLengths.length == numDataDims); + final int packedBytesLength = numDataDims * bytesPerDim; final int numPoints = TestUtil.nextInt(random(), 1, 100000); Point[] points = new Point[numPoints]; - for (int i = 0; i < numPoints; ++i) { - byte[] value = new byte[packedBytesLength]; - random().nextBytes(value); - points[i] = new Point(value, random().nextInt(maxDoc)); + if (random().nextInt(5) != 0) { + for (int i = 0; i < numPoints; ++i) { + byte[] value = new byte[packedBytesLength]; + random().nextBytes(value); + points[i] = new Point(value, random().nextInt(maxDoc)); + } + for (int i = 0; i < numDataDims; ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); + } + BytesRef firstValue = points[0].packedValue; + for (int i = 1; i < points.length; ++i) { + for (int dim = 0; dim < numDataDims; ++dim) { + int offset = dim * bytesPerDim; + BytesRef packedValue = 
points[i].packedValue; + System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); + } + } + } else { + //index dim are equal, data dims different + byte[] indexDims = new byte[numIndexdims * bytesPerDim]; + random().nextBytes(indexDims); + byte[] dataDims = new byte[(numDataDims - numIndexdims) * bytesPerDim]; + for (int i = 0; i < numPoints; ++i) { + byte[] value = new byte[packedBytesLength]; + System.arraycopy(indexDims, 0, value, 0, numIndexdims * bytesPerDim); + random().nextBytes(dataDims); + System.arraycopy(dataDims, 0, value, numIndexdims * bytesPerDim, (numDataDims - numIndexdims) * bytesPerDim); + points[i] = new Point(value, random().nextInt(maxDoc)); + } + for (int i = 0; i < numIndexdims; ++i) { + commonPrefixLengths[i] = bytesPerDim; + } + for (int i = numIndexdims; i < numDataDims; ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); + } + BytesRef firstValue = points[0].packedValue; + for (int i = 1; i < points.length; ++i) { + for (int dim = numIndexdims; dim < numDataDims; ++dim) { + int offset = dim * bytesPerDim; + BytesRef packedValue = points[i].packedValue; + System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); + } + } } return points; }