diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4f872f5b517..a3d9228ca38 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,6 +138,9 @@ Optimizations * LUCENE-8901: Load frequencies lazily only when needed in BlockDocsEnum and BlockImpactsEverythingEnum (Mayya Sharipova). +* LUCENE-8888: Optimize distribution of points with data dimensions in + BKD tree leaves. (Ignacio Vera) + Test Framework * LUCENE-8825: CheckHits now display the shard index in case of mismatch diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java index ff6cbaa9488..31ecb122ec5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java @@ -597,7 +597,7 @@ final class SimpleTextBKDWriter implements Closeable { assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; //We re-use the selector so we do not need to create an object every time. - BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); + BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); boolean success = false; try { @@ -605,7 +605,7 @@ final class SimpleTextBKDWriter implements Closeable { build(1, numLeaves, points, out, radixSelector, minPackedValue, maxPackedValue, - splitPackedValues, leafBlockFPs); + splitPackedValues, leafBlockFPs, new int[maxPointsInLeafNode]); // If no exception, we should have cleaned everything up: @@ -877,7 +877,7 @@ final class SimpleTextBKDWriter implements Closeable { } // sort by sortedDim - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, from, to, scratchBytesRef1, scratchBytesRef2); // Save the block file pointer: @@ -920,7 +920,7 @@ final class SimpleTextBKDWriter implements Closeable { break; } } - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen, reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); // set the split value @@ -951,7 +951,8 @@ final class SimpleTextBKDWriter implements Closeable { BKDRadixSelector radixSelector, byte[] minPackedValue, byte[] maxPackedValue, byte[] splitPackedValues, - long[] leafBlockFPs) throws IOException { + long[] leafBlockFPs, + int[] spareDocIds) throws IOException { if (nodeID >= leafNodeOffset) { @@ -1010,7 +1011,12 @@ final class SimpleTextBKDWriter implements Closeable { // loading the values: int count = to - from; assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; - writeLeafBlockDocs(out, heapSource.docIDs, from, count); + // Write doc IDs + int[] docIDs = spareDocIds; + for (int i = 0; i < count; i++) { + docIDs[i] = heapSource.getPackedValueSlice(from + i).docID(); + } + writeLeafBlockDocs(out, spareDocIds, 0, count); // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us // from the index, much like how terms dict does so from the FST: @@ -1030,7 +1036,7 @@ final class SimpleTextBKDWriter implements Closeable { } }; assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, - heapSource.docIDs, from); + docIDs, 0); writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); } else { @@ -1075,12 +1081,12 @@ final class SimpleTextBKDWriter implements Closeable { // Recurse on left tree: build(2*nodeID, leafNodeOffset, pathSlices[0], out, radixSelector, - minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs); + minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); // TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right? // Recurse on right tree: build(2*nodeID+1, leafNodeOffset, pathSlices[1], out, radixSelector, - minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs); + minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); } } @@ -1132,6 +1138,13 @@ final class SimpleTextBKDWriter implements Closeable { if (cmp > 0) { throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord + " sortedDim=" + sortedDim); } + if (cmp == 0 && numDataDims > numIndexDims) { + int dataOffset = numIndexDims * bytesPerDim; + cmp = FutureArrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength); + if (cmp > 0) { + throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + } if (cmp == 0 && doc < lastDoc) { throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord + " sortedDim=" + sortedDim); } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java index 6a212d888d7..4ade1040e88 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java @@ -48,6 +48,8 @@ public final class BKDRadixSelector { private final int bytesSorted; //data dimensions size private final int packedBytesLength; + // data dimensions plus docID size + private final int packedBytesDocIDLength; //flag to when we are moving to sort on heap private final int maxPointsSortInHeap; //reusable buffer @@ -60,18 +62,26 @@ public final class BKDRadixSelector { private final Directory tempDir; // prefix for temp files private final String tempFileNamePrefix; + // data and index dimensions + private final int numDataDims, numIndexDims; /** * Sole constructor. */ - public BKDRadixSelector(int numDim, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) { + public BKDRadixSelector(int numDataDims, int numIndexDims, int bytesPerDim, int maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) { this.bytesPerDim = bytesPerDim; - this.packedBytesLength = numDim * bytesPerDim; - this.bytesSorted = bytesPerDim + Integer.BYTES; + this.numDataDims = numDataDims; + this.numIndexDims = numIndexDims; + this.packedBytesLength = numDataDims * bytesPerDim; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; + // Selection and sorting is done in a given dimension. In case the value of the dimension are equal + // between two points we tie break first using the data-only dimensions and if those are still equal + // we tie-break on the docID. Here we account for all bytes used in the process. + this.bytesSorted = bytesPerDim + (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; this.maxPointsSortInHeap = maxPointsSortInHeap; - int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / (packedBytesLength + Integer.BYTES); - this.offlineBuffer = new byte[numberOfPointsOffline * (packedBytesLength + Integer.BYTES)]; + int numberOfPointsOffline = MAX_SIZE_OFFLINE_BUFFER / packedBytesDocIDLength; + this.offlineBuffer = new byte[numberOfPointsOffline * packedBytesDocIDLength]; this.partitionBucket = new int[bytesSorted]; this.histogram = new long[HISTOGRAM_SIZE]; this.scratch = new byte[bytesSorted]; @@ -134,12 +144,12 @@ public final class BKDRadixSelector { assert commonPrefixPosition > dimCommonPrefix; reader.next(); PointValue pointValue = reader.pointValue(); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); // copy dimension - BytesRef packedValue = pointValue.packedValue(); - System.arraycopy(packedValue.bytes, packedValue.offset + offset, scratch, 0, bytesPerDim); - // copy docID - BytesRef docIDBytes = pointValue.docIDBytes(); - System.arraycopy(docIDBytes.bytes, docIDBytes.offset, scratch, bytesPerDim, Integer.BYTES); + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + offset, scratch, 0, bytesPerDim); + // copy data dimensions and docID + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset + numIndexDims * bytesPerDim, scratch, bytesPerDim, (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES); + for (long i = from + 1; i < to; i++) { reader.next(); pointValue = reader.pointValue(); @@ -157,13 +167,15 @@ public final class BKDRadixSelector { //check common prefix and adjust histogram final int startIndex = (dimCommonPrefix > bytesPerDim) ? bytesPerDim : dimCommonPrefix; final int endIndex = (commonPrefixPosition > bytesPerDim) ? bytesPerDim : commonPrefixPosition; - packedValue = pointValue.packedValue(); - int j = FutureArrays.mismatch(scratch, startIndex, endIndex, packedValue.bytes, packedValue.offset + offset + startIndex, packedValue.offset + offset + endIndex); + packedValueDocID = pointValue.packedValueDocIDBytes(); + int j = FutureArrays.mismatch(scratch, startIndex, endIndex, packedValueDocID.bytes, packedValueDocID.offset + offset + startIndex, packedValueDocID.offset + offset + endIndex); if (j == -1) { if (commonPrefixPosition > bytesPerDim) { - //tie-break on docID - docIDBytes = pointValue.docIDBytes(); - int k = FutureArrays.mismatch(scratch, bytesPerDim, commonPrefixPosition, docIDBytes.bytes, docIDBytes.offset, docIDBytes.offset + commonPrefixPosition - bytesPerDim); + //tie-break on data dimensions + docID + final int startTieBreak = numIndexDims * bytesPerDim; + final int endTieBreak = startTieBreak + commonPrefixPosition - bytesPerDim; + int k = FutureArrays.mismatch(scratch, bytesPerDim, commonPrefixPosition, + packedValueDocID.bytes, packedValueDocID.offset + startTieBreak, packedValueDocID.offset + endTieBreak); if (k != -1) { commonPrefixPosition = bytesPerDim + k; Arrays.fill(histogram, 0); @@ -195,8 +207,8 @@ public final class BKDRadixSelector { BytesRef packedValue = pointValue.packedValue(); bucket = packedValue.bytes[packedValue.offset + offset + commonPrefixPosition] & 0xff; } else { - BytesRef docIDValue = pointValue.docIDBytes(); - bucket = docIDValue.bytes[docIDValue.offset + commonPrefixPosition - bytesPerDim] & 0xff; + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + bucket = packedValueDocID.bytes[packedValueDocID.offset + numIndexDims * bytesPerDim + commonPrefixPosition - bytesPerDim] & 0xff; } return bucket; } @@ -310,10 +322,11 @@ public final class BKDRadixSelector { return partition; } - private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefix) { - final int offset = dim * bytesPerDim + commonPrefix; - final int dimCmpBytes = bytesPerDim - commonPrefix; - new RadixSelector(bytesSorted - commonPrefix) { + private byte[] heapRadixSelect(HeapPointWriter points, int dim, int from, int to, int partitionPoint, int commonPrefixLength) { + final int dimOffset = dim * bytesPerDim + commonPrefixLength; + final int dimCmpBytes = bytesPerDim - commonPrefixLength; + final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes; + new RadixSelector(bytesSorted - commonPrefixLength) { @Override protected void swap(int i, int j) { @@ -325,23 +338,23 @@ public final class BKDRadixSelector { assert k >= 0 : "negative prefix " + k; if (k < dimCmpBytes) { // dim bytes - return points.block[i * packedBytesLength + offset + k] & 0xff; + return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff; } else { - // doc id - int s = 3 - (k - dimCmpBytes); - return (points.docIDs[i] >>> (s * 8)) & 0xff; + // data bytes + return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff; } } @Override protected Selector getFallbackSelector(int d) { - int skypedBytes = d + commonPrefix; - final int start = dim * bytesPerDim + skypedBytes; - final int end = dim * bytesPerDim + bytesPerDim; + final int skypedBytes = d + commonPrefixLength; + final int dimStart = dim * bytesPerDim + skypedBytes; + final int dimEnd = dim * bytesPerDim + bytesPerDim; + final int dataOffset = numIndexDims * bytesPerDim; + // data length is composed by the data dimensions plus the docID + final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; return new IntroSelector() { - int pivotDoc = -1; - @Override protected void swap(int i, int j) { points.swap(i, j); @@ -350,36 +363,37 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < bytesPerDim) { - System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim); + System.arraycopy(points.block, i * packedBytesDocIDLength + dim * bytesPerDim, scratch, 0, bytesPerDim); } - pivotDoc = points.docIDs[i]; + System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength); } @Override protected int compare(int i, int j) { if (skypedBytes < bytesPerDim) { - int iOffset = i * packedBytesLength; - int jOffset = j * packedBytesLength; - int cmp = FutureArrays.compareUnsigned(points.block, iOffset + start, iOffset + end, - points.block, jOffset + start, jOffset + end); + int iOffset = i * packedBytesDocIDLength; + int jOffset = j * packedBytesDocIDLength; + int cmp = FutureArrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return points.docIDs[i] - points.docIDs[j]; + int iOffset = i * packedBytesDocIDLength + dataOffset; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return FutureArrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength); } @Override protected int comparePivot(int j) { if (skypedBytes < bytesPerDim) { - int jOffset = j * packedBytesLength; - int cmp = FutureArrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, - points.block, jOffset + start, jOffset + end); + int jOffset = j * packedBytesDocIDLength; + int cmp = FutureArrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return pivotDoc - points.docIDs[j]; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return FutureArrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength); } }; } @@ -394,8 +408,9 @@ public final class BKDRadixSelector { /** Sort the heap writer by the specified dim. It is used to sort the leaves of the tree */ public void heapRadixSort(final HeapPointWriter points, int from, int to, int dim, int commonPrefixLength) { - final int offset = dim * bytesPerDim + commonPrefixLength; + final int dimOffset = dim * bytesPerDim + commonPrefixLength; final int dimCmpBytes = bytesPerDim - commonPrefixLength; + final int dataOffset = numIndexDims * bytesPerDim - dimCmpBytes; new MSBRadixSorter(bytesSorted - commonPrefixLength) { @Override @@ -403,11 +418,10 @@ public final class BKDRadixSelector { assert k >= 0 : "negative prefix " + k; if (k < dimCmpBytes) { // dim bytes - return points.block[i * packedBytesLength + offset + k] & 0xff; + return points.block[i * packedBytesDocIDLength + dimOffset + k] & 0xff; } else { - // doc id - int s = 3 - (k - dimCmpBytes); - return (points.docIDs[i] >>> (s * 8)) & 0xff; + // data bytes + return points.block[i * packedBytesDocIDLength + dataOffset + k] & 0xff; } } @@ -418,13 +432,14 @@ public final class BKDRadixSelector { @Override protected Sorter getFallbackSorter(int k) { - int skypedBytes = k + commonPrefixLength; - final int start = dim * bytesPerDim + skypedBytes; - final int end = dim * bytesPerDim + bytesPerDim; + final int skypedBytes = k + commonPrefixLength; + final int dimStart = dim * bytesPerDim + skypedBytes; + final int dimEnd = dim * bytesPerDim + bytesPerDim; + final int dataOffset = numIndexDims * bytesPerDim; + // data length is composed by the data dimensions plus the docID + final int dataLength = (numDataDims - numIndexDims) * bytesPerDim + Integer.BYTES; return new IntroSorter() { - int pivotDoc = -1; - @Override protected void swap(int i, int j) { points.swap(i, j); @@ -433,36 +448,37 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < bytesPerDim) { - System.arraycopy(points.block, i * packedBytesLength + dim * bytesPerDim, scratch, 0, bytesPerDim); + System.arraycopy(points.block, i * packedBytesDocIDLength + dim * bytesPerDim, scratch, 0, bytesPerDim); } - pivotDoc = points.docIDs[i]; + System.arraycopy(points.block, i * packedBytesDocIDLength + dataOffset, scratch, bytesPerDim, dataLength); } @Override protected int compare(int i, int j) { if (skypedBytes < bytesPerDim) { - int iOffset = i * packedBytesLength; - int jOffset = j * packedBytesLength; - int cmp = FutureArrays.compareUnsigned(points.block, iOffset + start, iOffset + end, - points.block, jOffset + start, jOffset + end); + int iOffset = i * packedBytesDocIDLength; + int jOffset = j * packedBytesDocIDLength; + int cmp = FutureArrays.compareUnsigned(points.block, iOffset + dimStart, iOffset + dimEnd, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return points.docIDs[i] - points.docIDs[j]; + int iOffset = i * packedBytesDocIDLength + dataOffset; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return FutureArrays.compareUnsigned(points.block, iOffset, iOffset + dataLength, points.block, jOffset, jOffset + dataLength); } @Override protected int comparePivot(int j) { if (skypedBytes < bytesPerDim) { - int jOffset = j * packedBytesLength; - int cmp = FutureArrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, - points.block, jOffset + start, jOffset + end); + int jOffset = j * packedBytesDocIDLength; + int cmp = FutureArrays.compareUnsigned(scratch, skypedBytes, bytesPerDim, points.block, jOffset + dimStart, jOffset + dimEnd); if (cmp != 0) { return cmp; } } - return pivotDoc - points.docIDs[j]; + int jOffset = j * packedBytesDocIDLength + dataOffset; + return FutureArrays.compareUnsigned(scratch, bytesPerDim, bytesPerDim + dataLength, points.block, jOffset, jOffset + dataLength); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 03cd90478c5..5512eff29bc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -776,7 +776,7 @@ public class BKDWriter implements Closeable { assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; //We re-use the selector so we do not need to create an object every time. - BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); + BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix); boolean success = false; try { @@ -787,7 +787,8 @@ public class BKDWriter implements Closeable { minPackedValue, maxPackedValue, parentSplits, splitPackedValues, - leafBlockFPs); + leafBlockFPs, + new int[maxPointsInLeafNode]); assert Arrays.equals(parentSplits, new int[numIndexDims]); // If no exception, we should have cleaned everything up: @@ -1366,7 +1367,7 @@ public class BKDWriter implements Closeable { } // sort by sortedDim - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, from, to, scratchBytesRef1, scratchBytesRef2); BytesRef comparator = scratchBytesRef1; @@ -1435,7 +1436,7 @@ public class BKDWriter implements Closeable { commonPrefixLen = bytesPerDim; } - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen, reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); // set the split value @@ -1472,7 +1473,8 @@ public class BKDWriter implements Closeable { byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits, byte[] splitPackedValues, - long[] leafBlockFPs) throws IOException { + long[] leafBlockFPs, + int[] spareDocIds) throws IOException { if (nodeID >= leafNodeOffset) { @@ -1532,7 +1534,13 @@ public class BKDWriter implements Closeable { // loading the values: int count = to - from; assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; - writeLeafBlockDocs(out, heapSource.docIDs, from, count); + assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length; + // Write doc IDs + int[] docIDs = spareDocIds; + for (int i = 0; i < count; i++) { + docIDs[i] = heapSource.getPackedValueSlice(from + i).docID(); + } + writeLeafBlockDocs(out, docIDs, 0, count); // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us // from the index, much like how terms dict does so from the FST: @@ -1555,7 +1563,7 @@ public class BKDWriter implements Closeable { } }; assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, - heapSource.docIDs, from); + docIDs, 0); writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality); } else { @@ -1602,12 +1610,12 @@ public class BKDWriter implements Closeable { // Recurse on left tree: build(2 * nodeID, leafNodeOffset, slices[0], out, radixSelector, minPackedValue, maxSplitPackedValue, - parentSplits, splitPackedValues, leafBlockFPs); + parentSplits, splitPackedValues, leafBlockFPs, spareDocIds); // Recurse on right tree: build(2 * nodeID + 1, leafNodeOffset, slices[1], out, radixSelector, minSplitPackedValue, maxPackedValue - , parentSplits, splitPackedValues, leafBlockFPs); + , parentSplits, splitPackedValues, leafBlockFPs, spareDocIds); parentSplits[splitDim]--; } @@ -1661,6 +1669,13 @@ public class BKDWriter implements Closeable { if (cmp > 0) { throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); } + if (cmp == 0 && numDataDims > numIndexDims) { + int dataOffset = numIndexDims * bytesPerDim; + cmp = FutureArrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength); + if (cmp > 0) { + throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + } if (cmp == 0 && doc < lastDoc) { throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord); } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java index 735b3138b21..4cee1ef036d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java @@ -27,16 +27,16 @@ public final class HeapPointReader implements PointReader { private int curRead; final byte[] block; final int packedBytesLength; - final int[] docIDs; + final int packedBytesDocIDLength; final int end; private final HeapPointValue pointValue; - public HeapPointReader(byte[] block, int packedBytesLength, int[] docIDs, int start, int end) { + public HeapPointReader(byte[] block, int packedBytesLength, int start, int end) { this.block = block; - this.docIDs = docIDs; curRead = start-1; this.end = end; this.packedBytesLength = packedBytesLength; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; if (start < end) { this.pointValue = new HeapPointValue(block, packedBytesLength); } else { @@ -53,7 +53,7 @@ public final class HeapPointReader implements PointReader { @Override public PointValue pointValue() { - pointValue.setValue(curRead * packedBytesLength, docIDs[curRead]); + pointValue.setOffset(curRead * packedBytesDocIDLength); return pointValue; } @@ -66,21 +66,22 @@ public final class HeapPointReader implements PointReader { */ static class HeapPointValue implements PointValue { - BytesRef packedValue; - BytesRef docIDBytes; - int docID; + final BytesRef packedValue; + final BytesRef packedValueDocID; + final int packedValueLength; - public HeapPointValue(byte[] value, int packedLength) { - packedValue = new BytesRef(value, 0, packedLength); - docIDBytes = new BytesRef(new byte[4]); + HeapPointValue(byte[] value, int packedValueLength) { + this.packedValueLength = packedValueLength; + this.packedValue = new BytesRef(value, 0, packedValueLength); + this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES); } /** - * Sets a new value by changing the offset and docID. + * Sets a new value by changing the offset. */ - public void setValue(int offset, int docID) { - this.docID = docID; + public void setOffset(int offset) { packedValue.offset = offset; + packedValueDocID.offset = offset; } @Override @@ -90,16 +91,14 @@ public final class HeapPointReader implements PointReader { @Override public int docID() { - return docID; + int position = packedValueDocID.offset + packedValueLength; + return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16) + | ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF); } @Override - public BytesRef docIDBytes() { - docIDBytes.bytes[0] = (byte) (docID >> 24); - docIDBytes.bytes[1] = (byte) (docID >> 16); - docIDBytes.bytes[2] = (byte) (docID >> 8); - docIDBytes.bytes[3] = (byte) (docID >> 0); - return docIDBytes; + public BytesRef packedValueDocIDBytes() { + return packedValueDocID; } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java index 0893c1917eb..cd1b8ac0190 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java @@ -25,10 +25,10 @@ import org.apache.lucene.util.FutureArrays; * @lucene.internal * */ public final class HeapPointWriter implements PointWriter { - public final int[] docIDs; public final byte[] block; final int size; final int packedBytesLength; + final int packedBytesDocIDLength; private final byte[] scratch; private int nextWrite; private boolean closed; @@ -37,11 +37,11 @@ public final class HeapPointWriter implements PointWriter { public HeapPointWriter(int size, int packedBytesLength) { - this.docIDs = new int[size]; - this.block = new byte[packedBytesLength * size]; - this.size = size; + this.packedBytesDocIDLength = packedBytesLength + Integer.BYTES; this.packedBytesLength = packedBytesLength; - this.scratch = new byte[packedBytesLength]; + this.block = new byte[packedBytesDocIDLength * size]; + this.size = size; + this.scratch = new byte[packedBytesDocIDLength]; if (size > 0) { pointValue = new HeapPointReader.HeapPointValue(block, packedBytesLength); } else { @@ -53,7 +53,7 @@ public final class HeapPointWriter implements PointWriter { /** Returns a reference, in result, to the byte[] slice holding this value */ public PointValue getPackedValueSlice(int index) { assert index < nextWrite : "nextWrite=" + (nextWrite) + " vs index=" + index; - pointValue.setValue(index * packedBytesLength, docIDs[index]); + pointValue.setOffset(index * packedBytesDocIDLength); return pointValue; } @@ -62,8 +62,12 @@ public final class HeapPointWriter implements PointWriter { assert closed == false : "point writer is already closed"; assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - System.arraycopy(packedValue, 0, block, nextWrite * packedBytesLength, packedBytesLength); - docIDs[nextWrite] = docID; + System.arraycopy(packedValue, 0, block, nextWrite * packedBytesDocIDLength, packedBytesLength); + int position = nextWrite * packedBytesDocIDLength + packedBytesLength; + block[position] = (byte) (docID >> 24); + block[++position] = (byte) (docID >> 16); + block[++position] = (byte) (docID >> 8); + block[++position] = (byte) (docID >> 0); nextWrite++; } @@ -71,27 +75,23 @@ public final class HeapPointWriter implements PointWriter { public void append(PointValue pointValue) { assert closed == false : "point writer is already closed"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - BytesRef packedValue = pointValue.packedValue(); - assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + (packedBytesLength) + "] but was [" + packedValue.length + "]"; - System.arraycopy(packedValue.bytes, packedValue.offset, block, nextWrite * packedBytesLength, packedBytesLength); - docIDs[nextWrite] = pointValue.docID(); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + assert packedValueDocID.length == packedBytesDocIDLength : "[packedValue] must have length [" + (packedBytesDocIDLength) + "] but was [" + packedValueDocID.length + "]"; + System.arraycopy(packedValueDocID.bytes, packedValueDocID.offset, block, nextWrite * packedBytesDocIDLength, packedBytesDocIDLength); nextWrite++; } public void swap(int i, int j) { - int docID = docIDs[i]; - docIDs[i] = docIDs[j]; - docIDs[j] = docID; - int indexI = i * packedBytesLength; - int indexJ = j * packedBytesLength; + int indexI = i * packedBytesDocIDLength; + int indexJ = j * packedBytesDocIDLength; // scratch1 = values[i] - System.arraycopy(block, indexI, scratch, 0, packedBytesLength); + System.arraycopy(block, indexI, scratch, 0, packedBytesDocIDLength); // values[i] = values[j] - System.arraycopy(block, indexJ, block, indexI, packedBytesLength); + System.arraycopy(block, indexJ, block, indexI, packedBytesDocIDLength); // values[j] = scratch1 - System.arraycopy(scratch, 0, block, indexJ, packedBytesLength); + System.arraycopy(scratch, 0, block, indexJ, packedBytesDocIDLength); } public int computeCardinality(int from, int to, int numDataDims, int bytesPerDim, int[] commonPrefixLengths) { @@ -101,8 +101,8 @@ public final class HeapPointWriter implements PointWriter { for (int dim = 0; dim < numDataDims; dim++) { final int start = dim * bytesPerDim + commonPrefixLengths[dim]; final int end = dim * bytesPerDim + bytesPerDim; - if (FutureArrays.mismatch(block, i * packedBytesLength + start, i * packedBytesLength + end, - block, (i - 1) * packedBytesLength + start, (i - 1) * packedBytesLength + end) != -1) { + if (FutureArrays.mismatch(block, i * packedBytesDocIDLength + start, i * packedBytesDocIDLength + end, + block, (i - 1) * packedBytesDocIDLength + start, (i - 1) * packedBytesDocIDLength + end) != -1) { leafCardinality++; break; } @@ -119,9 +119,9 @@ public final class HeapPointWriter implements PointWriter { @Override public PointReader getReader(long start, long length) { assert closed : "point writer is still open and trying to get a reader"; - assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length; + assert start + length <= size: "start=" + start + " length=" + length + " docIDs.length=" + size; assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite; - return new HeapPointReader(block, packedBytesLength, docIDs, (int) start, Math.toIntExact(start+length)); + return new HeapPointReader(block, packedBytesLength, (int) start, Math.toIntExact(start+length)); } @Override @@ -135,6 +135,6 @@ public final class HeapPointWriter implements PointWriter { @Override public String toString() { - return "HeapPointWriter(count=" + nextWrite + " size=" + docIDs.length + ")"; + return "HeapPointWriter(count=" + nextWrite + " size=" + size + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java index c56ed71df4d..45e695ba7e1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java @@ -77,7 +77,8 @@ public final class MutablePointsReaderUtils { protected int comparePivot(int j) { if (k < packedBytesLength) { reader.getValue(j, scratch); - int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k, scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k); + int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + packedBytesLength - k, + scratch.bytes, scratch.offset + k, scratch.offset + k + packedBytesLength - k); if (cmp != 0) { return cmp; } @@ -91,14 +92,16 @@ public final class MutablePointsReaderUtils { } /** Sort points on the given dimension. */ - public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths, + public static void sortByDim(int numDataDim, int numIndexDim, int sortedDim, int bytesPerDim, int[] commonPrefixLengths, MutablePointValues reader, int from, int to, BytesRef scratch1, BytesRef scratch2) { + final int start = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim]; + final int dimEnd = sortedDim * bytesPerDim + bytesPerDim; + final int dataStart = numIndexDim * bytesPerDim; + final int dataEnd = dataStart + (numDataDim - numIndexDim) * bytesPerDim; // No need for a fancy radix sort here, this is called on the leaves only so // there are not many values to sort - final int offset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim]; - final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim]; new IntroSorter() { final BytesRef pivot = scratch1; @@ -118,9 +121,14 @@ public final class MutablePointsReaderUtils { @Override protected int comparePivot(int j) { reader.getValue(j, scratch2); - int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + offset, pivot.offset + offset + numBytesToCompare, scratch2.bytes, scratch2.offset + offset, scratch2.offset + offset + numBytesToCompare); + int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + start, pivot.offset + dimEnd, scratch2.bytes, + scratch2.offset + start, scratch2.offset + dimEnd); if (cmp == 0) { - cmp = pivotDoc - reader.getDocID(j); + cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd, + scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd); + if (cmp == 0) { + cmp = pivotDoc - reader.getDocID(j); + } } return cmp; } @@ -130,16 +138,20 @@ public final class MutablePointsReaderUtils { /** Partition points around {@code mid}. All values on the left must be less * than or equal to it and all values on the right must be greater than or * equal to it. */ - public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen, + public static void partition(int numDataDim, int numIndexDim, int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen, MutablePointValues reader, int from, int to, int mid, BytesRef scratch1, BytesRef scratch2) { - final int offset = splitDim * bytesPerDim + commonPrefixLen; - final int cmpBytes = bytesPerDim - commonPrefixLen; + final int dimOffset = splitDim * bytesPerDim + commonPrefixLen; + final int dimCmpBytes = bytesPerDim - commonPrefixLen; + final int dataOffset = numIndexDim * bytesPerDim; + final int dataCmpBytes = (numDataDim - numIndexDim) * bytesPerDim + dimCmpBytes; final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1); - new RadixSelector(cmpBytes + (bitsPerDocId + 7) / 8) { + new RadixSelector(dataCmpBytes + (bitsPerDocId + 7) / 8) { @Override protected Selector getFallbackSelector(int k) { + final int dataStart = (k < dimCmpBytes) ? dataOffset : dataOffset + k - dimCmpBytes; + final int dataEnd = numDataDim * bytesPerDim; return new IntroSelector() { final BytesRef pivot = scratch1; @@ -158,9 +170,18 @@ public final class MutablePointsReaderUtils { @Override protected int comparePivot(int j) { - if (k < cmpBytes) { + if (k < dimCmpBytes) { reader.getValue(j, scratch2); - int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + offset + k, pivot.offset + offset + k + cmpBytes - k, scratch2.bytes, scratch2.offset + offset + k, scratch2.offset + offset + k + cmpBytes - k); + int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + dimOffset + k, pivot.offset + dimOffset + dimCmpBytes, + scratch2.bytes, scratch2.offset + dimOffset + k, scratch2.offset + dimOffset + dimCmpBytes); + if (cmp != 0) { + return cmp; + } + } + if (k < dataCmpBytes) { + reader.getValue(j, scratch2); + int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + dataStart, pivot.offset + dataEnd, + scratch2.bytes, scratch2.offset + dataStart, scratch2.offset + dataEnd); if (cmp != 0) { return cmp; } @@ -177,10 +198,12 @@ public final class MutablePointsReaderUtils { @Override protected int byteAt(int i, int k) { - if (k < cmpBytes) { - return Byte.toUnsignedInt(reader.getByteAt(i, offset + k)); + if (k < dimCmpBytes) { + return Byte.toUnsignedInt(reader.getByteAt(i, dimOffset + k)); + } else if (k < dataCmpBytes) { + return Byte.toUnsignedInt(reader.getByteAt(i, dataOffset + k - dimCmpBytes)); } else { - final int shift = bitsPerDocId - ((k - cmpBytes + 1) << 3); + final int shift = bitsPerDocId - ((k - dataCmpBytes + 1) << 3); return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java index 6218429b071..030bd719fea 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java @@ -137,12 +137,14 @@ public final class OfflinePointReader implements PointReader { */ static class OfflinePointValue implements PointValue { - BytesRef packedValue; - BytesRef docIDBytes; + final BytesRef packedValue; + final BytesRef packedValueDocID; + final int packedValueLength; OfflinePointValue(byte[] value, int packedValueLength) { - packedValue = new BytesRef(value, 0, packedValueLength); - docIDBytes = new BytesRef(value, packedValueLength, Integer.BYTES); + this.packedValueLength = packedValueLength; + this.packedValue = new BytesRef(value, 0, packedValueLength); + this.packedValueDocID = new BytesRef(value, 0, packedValueLength + Integer.BYTES); } /** @@ -150,7 +152,7 @@ public final class OfflinePointReader implements PointReader { */ public void setOffset(int offset) { packedValue.offset = offset; - docIDBytes.offset = offset + packedValue.length; + packedValueDocID.offset = offset; } @Override @@ -160,14 +162,14 @@ public final class OfflinePointReader implements PointReader { @Override public int docID() { - int position =docIDBytes.offset; - return ((docIDBytes.bytes[position] & 0xFF) << 24) | ((docIDBytes.bytes[++position] & 0xFF) << 16) - | ((docIDBytes.bytes[++position] & 0xFF) << 8) | (docIDBytes.bytes[++position] & 0xFF); + int position = packedValueDocID.offset + packedValueLength; + return ((packedValueDocID.bytes[position] & 0xFF) << 24) | ((packedValueDocID.bytes[++position] & 0xFF) << 16) + | ((packedValueDocID.bytes[++position] & 0xFF) << 8) | (packedValueDocID.bytes[++position] & 0xFF); } @Override - public BytesRef docIDBytes() { - return docIDBytes; + public BytesRef packedValueDocIDBytes() { + return packedValueDocID; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java index 4f4ce9ee740..d9fa86f5307 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java @@ -62,12 +62,9 @@ public final class OfflinePointWriter implements PointWriter { @Override public void append(PointValue pointValue) throws IOException { assert closed == false : "Point writer is already closed"; - BytesRef packedValue = pointValue.packedValue(); - assert packedValue.length == packedBytesLength : "[packedValue] must have length [" + packedBytesLength + "] but was [" + packedValue.length + "]"; - out.writeBytes(packedValue.bytes, packedValue.offset, packedValue.length); - BytesRef docIDBytes = pointValue.docIDBytes(); - assert docIDBytes.length == Integer.BYTES : "[docIDBytes] must have length [" + Integer.BYTES + "] but was [" + docIDBytes.length + "]"; - out.writeBytes(docIDBytes.bytes, docIDBytes.offset, docIDBytes.length); + BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + assert packedValueDocID.length == packedBytesLength + Integer.BYTES : "[packedValue and docID] must have length [" + (packedBytesLength + Integer.BYTES) + "] but was [" + packedValueDocID.length + "]"; + out.writeBytes(packedValueDocID.bytes, packedValueDocID.offset, packedValueDocID.length); count++; assert expectedCount == 0 || count <= expectedCount : "expectedCount=" + expectedCount + " vs count=" + count; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java b/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java index 28506278782..d933e81ff61 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/PointValue.java @@ -28,10 +28,11 @@ public interface PointValue { /** Returns the packed values for the dimensions */ BytesRef packedValue(); - /** Returns the document id */ + /** Returns the docID */ int docID(); - /** Returns the byte representation of the document id */ - BytesRef docIDBytes(); + /** Returns the byte representation of the packed value + * together with the docID */ + BytesRef packedValueDocIDBytes(); } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index 2b3cd8a6f77..7c537a4758d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -1098,6 +1098,80 @@ public class TestBKD extends LuceneTestCase { } } + public void testCheckDataDimOptimalOrder() throws IOException { + Directory dir = newDirectory(); + final int numValues = atLeast(5000); + final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); + final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4); + final double maxMB = (float) 3.0 + (3*random().nextDouble()); + + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); + + final byte[] pointValue1 = new byte[numDataDims * numBytesPerDim]; + final byte[] pointValue2 = new byte[numDataDims * numBytesPerDim]; + random().nextBytes(pointValue1); + random().nextBytes(pointValue2); + // equal index dimensions but different data dimensions + for (int i = 0; i < numIndexDims; i++) { + System.arraycopy(pointValue1, i * numBytesPerDim, pointValue2, i * numBytesPerDim, numBytesPerDim); + } + + BKDWriter w = new BKDWriter(2 * numValues, dir, "_temp", numDataDims, numIndexDims, numBytesPerDim, maxPointsInLeafNode, + maxMB, 2 * numValues); + for (int i = 0; i < numValues; ++i) { + w.add(pointValue1, i); + w.add(pointValue2, i); + } + final long indexFP; + try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { + indexFP = w.finish(out); + w.close(); + } + + IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT); + pointsIn.seek(indexFP); + BKDReader points = new BKDReader(pointsIn); + + points.intersect(new IntersectVisitor() { + + byte[] previous = null; + boolean hasChanged = false; + + @Override + public void visit(int docID) { + throw new UnsupportedOperationException(); + } + + @Override + public void visit(int docID, byte[] packedValue) { + if (previous == null) { + previous = new byte[numDataDims * numBytesPerDim]; + System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim); + } else { + int mismatch = FutureArrays.mismatch(packedValue, 0, numDataDims * numBytesPerDim, previous, 0, numDataDims * numBytesPerDim); + if (mismatch != -1) { + if (hasChanged == false) { + hasChanged = true; + System.arraycopy(packedValue, 0, previous, 0, numDataDims * numBytesPerDim); + } else { + fail("Points are not in optimal order"); + } + } + } + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + + pointsIn.close(); + dir.close(); + } + public void test2DLongOrdsOffline() throws Exception { try (Directory dir = newDirectory()) { int numDocs = 100000; diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java index 456667339a7..939991c138b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java @@ -209,7 +209,7 @@ public class TestBKDRadixSelector extends LuceneTestCase { } private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{ - BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDimensions, sortedOnHeap, dir, "test"); //we only split by indexed dimension so we check for each only those dimension for (int splitDim = 0; splitDim < indexDimensions; splitDim++) { //We need to make a copy of the data as it is deleted in the process @@ -225,9 +225,15 @@ public class TestBKDRadixSelector extends LuceneTestCase { int cmp = FutureArrays.compareUnsigned(max, 0, bytesPerDimensions, min, 0, bytesPerDimensions); assertTrue(cmp <= 0); if (cmp == 0) { - int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint); - int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint); - assertTrue(minDocId >= maxDocID); + byte[] maxDataDim = getMaxDataDimension(slices[0], bytesPerDimensions, dataDimensions, indexDimensions, max, splitDim); + byte[] minDataDim = getMinDataDimension(slices[1], bytesPerDimensions, dataDimensions, indexDimensions, min, splitDim); + cmp = FutureArrays.compareUnsigned(maxDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions, minDataDim, 0, (dataDimensions - indexDimensions) * bytesPerDimensions); + assertTrue(cmp <= 0); + if (cmp == 0) { + int maxDocID = getMaxDocId(slices[0], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,maxDataDim); + int minDocId = getMinDocId(slices[1], bytesPerDimensions, splitDim, partitionPoint, dataDimensions, indexDimensions,minDataDim); + assertTrue(minDocId >= maxDocID); + } } assertTrue(Arrays.equals(partitionPoint, min)); slices[0].writer.destroy(); @@ -293,14 +299,17 @@ public class TestBKDRadixSelector extends LuceneTestCase { return min; } - private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException { + private int getMinDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException { int docID = Integer.MAX_VALUE; try (PointReader reader = p.writer.getReader(p.start, p.count)) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); int offset = dimension * bytesPerDimension; - if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) { + int dataOffset = indexDims * bytesPerDimension; + int dataLength = (dataDims - indexDims) * bytesPerDimension; + if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0 + && FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) { int newDocID = pointValue.docID(); if (newDocID < docID) { docID = newDocID; @@ -311,6 +320,26 @@ public class TestBKDRadixSelector extends LuceneTestCase { return docID; } + private byte[] getMinDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] minDim, int splitDim) throws IOException { + byte[] min = new byte[(dataDims - indexDims) * bytesPerDimension]; + Arrays.fill(min, (byte) 0xff); + int offset = splitDim * bytesPerDimension; + try (PointReader reader = p.writer.getReader(p.start, p.count)) { + byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension]; + while (reader.next()) { + PointValue pointValue = reader.pointValue(); + BytesRef packedValue = pointValue.packedValue(); + if (FutureArrays.mismatch(minDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) { + System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension); + if (FutureArrays.compareUnsigned(min, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) > 0) { + System.arraycopy(value, 0, min, 0, (dataDims - indexDims) * bytesPerDimension); + } + } + } + } + return min; + } + private byte[] getMax(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension) throws IOException { byte[] max = new byte[bytesPerDimension]; Arrays.fill(max, (byte) 0); @@ -328,14 +357,37 @@ public class TestBKDRadixSelector extends LuceneTestCase { return max; } - private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint) throws IOException { + private byte[] getMaxDataDimension(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dataDims, int indexDims, byte[] maxDim, int splitDim) throws IOException { + byte[] max = new byte[(dataDims - indexDims) * bytesPerDimension]; + Arrays.fill(max, (byte) 0); + int offset = splitDim * bytesPerDimension; + try (PointReader reader = p.writer.getReader(p.start, p.count)) { + byte[] value = new byte[(dataDims - indexDims) * bytesPerDimension]; + while (reader.next()) { + PointValue pointValue = reader.pointValue(); + BytesRef packedValue = pointValue.packedValue(); + if (FutureArrays.mismatch(maxDim, 0, bytesPerDimension, packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension) == -1) { + System.arraycopy(packedValue.bytes, packedValue.offset + indexDims * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension); + if (FutureArrays.compareUnsigned(max, 0, (dataDims - indexDims) * bytesPerDimension, value, 0, (dataDims - indexDims) * bytesPerDimension) < 0) { + System.arraycopy(value, 0, max, 0, (dataDims - indexDims) * bytesPerDimension); + } + } + } + } + return max; + } + + private int getMaxDocId(BKDRadixSelector.PathSlice p, int bytesPerDimension, int dimension, byte[] partitionPoint, int dataDims, int indexDims, byte[] dataDim) throws IOException { int docID = Integer.MIN_VALUE; try (PointReader reader = p.writer.getReader(p.start, p.count)) { while (reader.next()) { PointValue pointValue = reader.pointValue(); BytesRef packedValue = pointValue.packedValue(); int offset = dimension * bytesPerDimension; - if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0) { + int dataOffset = indexDims * bytesPerDimension; + int dataLength = (dataDims - indexDims) * bytesPerDimension; + if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDimension, partitionPoint, 0, bytesPerDimension) == 0 + && FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + dataOffset, packedValue.offset + dataOffset + dataLength, dataDim, 0, dataLength) == 0) { int newDocID = pointValue.docID(); if (newDocID > docID) { docID = newDocID; diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java index 045f223b5a1..7fecca79cdb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java @@ -117,7 +117,7 @@ public class TestBKDRadixSort extends LuceneTestCase { private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{ int packedBytesLength = dataDimensions * bytesPerDim; Directory dir = newDirectory(); - BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test"); + BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, indexDimensions, bytesPerDim, 1000, dir, "test"); // we check for each dimension for (int splitDim = 0; splitDim < dataDimensions; splitDim++) { radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim)); @@ -130,6 +130,11 @@ public class TestBKDRadixSort extends LuceneTestCase { BytesRef value = pointValue.packedValue(); int cmp = FutureArrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim); assertTrue(cmp >= 0); + if (cmp == 0) { + int dataOffset = indexDimensions * bytesPerDim; + cmp = FutureArrays.compareUnsigned(value.bytes, value.offset + dataOffset, value.offset + packedBytesLength, previous, dataOffset, packedBytesLength); + assertTrue(cmp >= 0); + } if (cmp == 0) { assertTrue(pointValue.docID() >= previousDocId); } diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java index c2a516345bc..d3b105d12c5 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java @@ -38,7 +38,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { private void doTestSort() { final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(1, bytesPerDim, maxDoc); + Point[] points = createRandomPoints(1, 1, bytesPerDim, maxDoc, new int[1]); DummyPointsReader reader = new DummyPointsReader(points); MutablePointsReaderUtils.sort(maxDoc, bytesPerDim, reader, 0, points.length); Arrays.sort(points, new Comparator() { @@ -62,25 +62,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { } private void doTestSortByDim() { - final int numDims = TestUtil.nextInt(random(), 1, 8); + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc); - int[] commonPrefixLengths = new int[numDims]; - for (int i = 0; i < commonPrefixLengths.length; ++i) { - commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); - } - BytesRef firstValue = points[0].packedValue; - for (int i = 1; i < points.length; ++i) { - for (int dim = 0; dim < numDims; ++dim) { - int offset = dim * bytesPerDim; - BytesRef packedValue = points[i].packedValue; - System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); - } - } + int[] commonPrefixLengths = new int[numDataDims]; + Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths); DummyPointsReader reader = new DummyPointsReader(points); - final int sortedDim = random().nextInt(numDims); - MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length, + final int sortedDim = random().nextInt(numIndexDims); + MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length, new BytesRef(), new BytesRef()); for (int i = 1; i < points.length; ++i) { final int offset = sortedDim * bytesPerDim; @@ -88,7 +78,13 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { BytesRef currentValue = reader.points[i].packedValue; int cmp = FutureArrays.compareUnsigned(previousValue.bytes, previousValue.offset + offset, previousValue.offset + offset + bytesPerDim, currentValue.bytes, currentValue.offset + offset, currentValue.offset + offset + bytesPerDim); if (cmp == 0) { - cmp = reader.points[i - 1].doc - reader.points[i].doc; + int dataDimOffset = numIndexDims * bytesPerDim; + int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim; + cmp = FutureArrays.compareUnsigned(previousValue.bytes, previousValue.offset + dataDimOffset, previousValue.offset + dataDimOffset + dataDimsLength, + currentValue.bytes, currentValue.offset + dataDimOffset, currentValue.offset + dataDimOffset + dataDimsLength); + if (cmp == 0) { + cmp = reader.points[i - 1].doc - reader.points[i].doc; + } } assertTrue(cmp <= 0); } @@ -101,29 +97,31 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { } private void doTestPartition() { - final int numDims = TestUtil.nextInt(random(), 1, 8); + final int numIndexDims = TestUtil.nextInt(random(), 1, 8); + final int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8); final int bytesPerDim = TestUtil.nextInt(random(), 1, 16); + int[] commonPrefixLengths = new int[numDataDims]; final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30)); - Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc); - int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim); - final int splitDim = random().nextInt(numDims); - BytesRef firstValue = points[0].packedValue; - for (int i = 1; i < points.length; ++i) { - BytesRef packedValue = points[i].packedValue; - int offset = splitDim * bytesPerDim; - System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength); - } + Point[] points = createRandomPoints(numDataDims, numIndexDims, bytesPerDim, maxDoc, commonPrefixLengths); + final int splitDim = random().nextInt(numIndexDims); DummyPointsReader reader = new DummyPointsReader(points); final int pivot = TestUtil.nextInt(random(), 0, points.length - 1); - MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot, + MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLengths[splitDim], reader, 0, points.length, pivot, new BytesRef(), new BytesRef()); BytesRef pivotValue = reader.points[pivot].packedValue; int offset = splitDim * bytesPerDim; for (int i = 0; i < points.length; ++i) { BytesRef value = reader.points[i].packedValue; - int cmp = FutureArrays.compareUnsigned(value.bytes, value.offset + offset, value.offset + offset + bytesPerDim, pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim); + int cmp = FutureArrays.compareUnsigned(value.bytes, value.offset + offset, value.offset + offset + bytesPerDim, + pivotValue.bytes, pivotValue.offset + offset, pivotValue.offset + offset + bytesPerDim); if (cmp == 0) { - cmp = reader.points[i].doc - reader.points[pivot].doc; + int dataDimOffset = numIndexDims * bytesPerDim; + int dataDimsLength = (numDataDims - numIndexDims) * bytesPerDim; + cmp = FutureArrays.compareUnsigned(value.bytes, value.offset + dataDimOffset, value.offset + dataDimOffset + dataDimsLength, + pivotValue.bytes, pivotValue.offset + dataDimOffset, pivotValue.offset + dataDimOffset + dataDimsLength); + if (cmp == 0) { + cmp = reader.points[i].doc - reader.points[pivot].doc; + } } if (i < pivot) { assertTrue(cmp <= 0); @@ -135,14 +133,54 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase { } } - private static Point[] createRandomPoints(int numDims, int bytesPerDim, int maxDoc) { - final int packedBytesLength = numDims * bytesPerDim; + private static Point[] createRandomPoints(int numDataDims, int numIndexdims, int bytesPerDim, int maxDoc, int[] commonPrefixLengths) { + assertTrue(commonPrefixLengths.length == numDataDims); + final int packedBytesLength = numDataDims * bytesPerDim; final int numPoints = TestUtil.nextInt(random(), 1, 100000); Point[] points = new Point[numPoints]; - for (int i = 0; i < numPoints; ++i) { - byte[] value = new byte[packedBytesLength]; - random().nextBytes(value); - points[i] = new Point(value, random().nextInt(maxDoc)); + if (random().nextInt(5) != 0) { + for (int i = 0; i < numPoints; ++i) { + byte[] value = new byte[packedBytesLength]; + random().nextBytes(value); + points[i] = new Point(value, random().nextInt(maxDoc)); + } + for (int i = 0; i < numDataDims; ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); + } + BytesRef firstValue = points[0].packedValue; + for (int i = 1; i < points.length; ++i) { + for (int dim = 0; dim < numDataDims; ++dim) { + int offset = dim * bytesPerDim; + BytesRef packedValue = points[i].packedValue; + System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); + } + } + } else { + //index dim are equal, data dims different + byte[] indexDims = new byte[numIndexdims * bytesPerDim]; + random().nextBytes(indexDims); + byte[] dataDims = new byte[(numDataDims - numIndexdims) * bytesPerDim]; + for (int i = 0; i < numPoints; ++i) { + byte[] value = new byte[packedBytesLength]; + System.arraycopy(indexDims, 0, value, 0, numIndexdims * bytesPerDim); + random().nextBytes(dataDims); + System.arraycopy(dataDims, 0, value, numIndexdims * bytesPerDim, (numDataDims - numIndexdims) * bytesPerDim); + points[i] = new Point(value, random().nextInt(maxDoc)); + } + for (int i = 0; i < numIndexdims; ++i) { + commonPrefixLengths[i] = bytesPerDim; + } + for (int i = numDataDims; i < numDataDims; ++i) { + commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim); + } + BytesRef firstValue = points[0].packedValue; + for (int i = 1; i < points.length; ++i) { + for (int dim = numIndexdims; dim < numDataDims; ++dim) { + int offset = dim * bytesPerDim; + BytesRef packedValue = points[i].packedValue; + System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]); + } + } } return points; }