From 74085cd1b083d556e7772043a881464c4ea07b78 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Fri, 24 Nov 2023 13:58:39 +0100 Subject: [PATCH] Hide the internal data structure of HeapPointWriter (#12762) --- lucene/CHANGES.txt | 2 + .../lucene/util/bkd/BKDRadixSelector.java | 120 ++----------- .../lucene/util/bkd/HeapPointReader.java | 59 +----- .../lucene/util/bkd/HeapPointWriter.java | 168 +++++++++++++++--- 4 files changed, 167 insertions(+), 182 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8c85e5bce33..93bdc076b4d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -363,6 +363,8 @@ Other overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded. (Stefan Vodita) +* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera) + ======================== Lucene 9.8.0 ======================= API Changes diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java index 662f21b6c86..62586641e2c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java @@ -19,8 +19,6 @@ package org.apache.lucene.util.bkd; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntroSelector; import org.apache.lucene.util.IntroSorter; @@ -181,12 +179,8 @@ public final class BKDRadixSelector { break; } else { // Check common prefix and adjust histogram - final int startIndex = - (dimCommonPrefix > config.bytesPerDim) ? config.bytesPerDim : dimCommonPrefix; - final int endIndex = - (commonPrefixPosition > config.bytesPerDim) - ? config.bytesPerDim - : commonPrefixPosition; + final int startIndex = Math.min(dimCommonPrefix, config.bytesPerDim); + final int endIndex = Math.min(commonPrefixPosition, config.bytesPerDim); packedValueDocID = pointValue.packedValueDocIDBytes(); int j = Arrays.mismatch( @@ -427,24 +421,13 @@ public final class BKDRadixSelector { @Override protected int byteAt(int i, int k) { assert k >= 0 : "negative prefix " + k; - if (k < dimCmpBytes) { - // dim bytes - return points.block[i * config.bytesPerDoc + dimOffset + k] & 0xff; - } else { - // data bytes - return points.block[i * config.bytesPerDoc + dataOffset + k] & 0xff; - } + return points.byteAt(i, k < dimCmpBytes ? dimOffset + k : dataOffset + k); } @Override protected Selector getFallbackSelector(int d) { final int skypedBytes = d + commonPrefixLength; final int dimStart = dim * config.bytesPerDim; - // data length is composed by the data dimensions plus the docID - final int dataLength = - (config.numDims - config.numIndexDims) * config.bytesPerDim + Integer.BYTES; - final ByteArrayComparator dimComparator = - ArrayUtil.getUnsignedComparator(config.bytesPerDim); return new IntroSelector() { @Override @@ -455,61 +438,31 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < config.bytesPerDim) { - System.arraycopy( - points.block, - i * config.bytesPerDoc + dim * config.bytesPerDim, - scratch, - 0, - config.bytesPerDim); + points.copyDim(i, dimStart, scratch, 0); } - System.arraycopy( - points.block, - i * config.bytesPerDoc + config.packedIndexBytesLength, - scratch, - config.bytesPerDim, - dataLength); + points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim); } @Override protected int compare(int i, int j) { if (skypedBytes < config.bytesPerDim) { - int iOffset = i * config.bytesPerDoc; - int jOffset = j * config.bytesPerDoc; - int cmp = - dimComparator.compare( - points.block, iOffset + dimStart, points.block, jOffset + dimStart); + int cmp = points.compareDim(i, j, dimStart); if (cmp != 0) { return cmp; } } - int iOffset = i * config.bytesPerDoc + config.packedIndexBytesLength; - int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; - return Arrays.compareUnsigned( - points.block, - iOffset, - iOffset + dataLength, - points.block, - jOffset, - jOffset + dataLength); + return points.compareDataDimsAndDoc(i, j); } @Override protected int comparePivot(int j) { if (skypedBytes < config.bytesPerDim) { - int jOffset = j * config.bytesPerDoc; - int cmp = dimComparator.compare(scratch, 0, points.block, jOffset + dimStart); + int cmp = points.compareDim(j, scratch, 0, dimStart); if (cmp != 0) { return cmp; } } - int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; - return Arrays.compareUnsigned( - scratch, - config.bytesPerDim, - config.bytesPerDim + dataLength, - points.block, - jOffset, - jOffset + dataLength); + return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim); } }; } @@ -538,13 +491,7 @@ public final class BKDRadixSelector { @Override protected int byteAt(int i, int k) { assert k >= 0 : "negative prefix " + k; - if (k < dimCmpBytes) { - // dim bytes - return points.block[i * config.bytesPerDoc + dimOffset + k] & 0xff; - } else { - // data bytes - return points.block[i * config.bytesPerDoc + dataOffset + k] & 0xff; - } + return points.byteAt(i, k < dimCmpBytes ? dimOffset + k : dataOffset + k); } @Override @@ -556,11 +503,6 @@ public final class BKDRadixSelector { protected Sorter getFallbackSorter(int k) { final int skypedBytes = k + commonPrefixLength; final int dimStart = dim * config.bytesPerDim; - // data length is composed by the data dimensions plus the docID - final int dataLength = - (config.numDims - config.numIndexDims) * config.bytesPerDim + Integer.BYTES; - final ByteArrayComparator dimComparator = - ArrayUtil.getUnsignedComparator(config.bytesPerDim); return new IntroSorter() { @Override @@ -571,61 +513,31 @@ public final class BKDRadixSelector { @Override protected void setPivot(int i) { if (skypedBytes < config.bytesPerDim) { - System.arraycopy( - points.block, - i * config.bytesPerDoc + dim * config.bytesPerDim, - scratch, - 0, - config.bytesPerDim); + points.copyDim(i, dimStart, scratch, 0); } - System.arraycopy( - points.block, - i * config.bytesPerDoc + config.packedIndexBytesLength, - scratch, - config.bytesPerDim, - dataLength); + points.copyDataDimsAndDoc(i, scratch, config.bytesPerDim); } @Override protected int compare(int i, int j) { if (skypedBytes < config.bytesPerDim) { - int iOffset = i * config.bytesPerDoc; - int jOffset = j * config.bytesPerDoc; - int cmp = - dimComparator.compare( - points.block, iOffset + dimStart, points.block, jOffset + dimStart); + final int cmp = points.compareDim(i, j, dimStart); if (cmp != 0) { return cmp; } } - int iOffset = i * config.bytesPerDoc + config.packedIndexBytesLength; - int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; - return Arrays.compareUnsigned( - points.block, - iOffset, - iOffset + dataLength, - points.block, - jOffset, - jOffset + dataLength); + return points.compareDataDimsAndDoc(i, j); } @Override protected int comparePivot(int j) { if (skypedBytes < config.bytesPerDim) { - int jOffset = j * config.bytesPerDoc; - int cmp = dimComparator.compare(scratch, 0, points.block, jOffset + dimStart); + int cmp = points.compareDim(j, scratch, 0, dimStart); if (cmp != 0) { return cmp; } } - int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; - return Arrays.compareUnsigned( - scratch, - config.bytesPerDim, - config.bytesPerDim + dataLength, - points.block, - jOffset, - jOffset + dataLength); + return points.compareDataDimsAndDoc(j, scratch, config.bytesPerDim); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java index 093b6dd5d54..2b79ad704be 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java @@ -16,8 +16,7 @@ */ package org.apache.lucene.util.bkd; -import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; +import java.util.function.IntFunction; /** * Utility class to read buffered points from in-heap arrays. @@ -26,22 +25,13 @@ import org.apache.lucene.util.BytesRef; */ public final class HeapPointReader implements PointReader { private int curRead; - final byte[] block; - final BKDConfig config; - final int end; - private final HeapPointValue pointValue; + private final int end; + private final IntFunction points; - public HeapPointReader(BKDConfig config, byte[] block, int start, int end) { - this.block = block; + HeapPointReader(IntFunction points, int start, int end) { curRead = start - 1; this.end = end; - this.config = config; - if (start < end) { - this.pointValue = new HeapPointValue(config, block); - } else { - // no values - this.pointValue = null; - } + this.points = points; } @Override @@ -52,46 +42,9 @@ public final class HeapPointReader implements PointReader { @Override public PointValue pointValue() { - pointValue.setOffset(curRead * config.bytesPerDoc); - return pointValue; + return points.apply(curRead); } @Override public void close() {} - - /** Reusable implementation for a point value on-heap */ - static class HeapPointValue implements PointValue { - - final BytesRef packedValue; - final BytesRef packedValueDocID; - final int packedValueLength; - - HeapPointValue(BKDConfig config, byte[] value) { - this.packedValueLength = config.packedBytesLength; - this.packedValue = new BytesRef(value, 0, packedValueLength); - this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc); - } - - /** Sets a new value by changing the offset. */ - public void setOffset(int offset) { - packedValue.offset = offset; - packedValueDocID.offset = offset; - } - - @Override - public BytesRef packedValue() { - return packedValue; - } - - @Override - public int docID() { - int position = packedValueDocID.offset + packedValueLength; - return (int) BitUtil.VH_BE_INT.get(packedValueDocID.bytes, position); - } - - @Override - public BytesRef packedValueDocIDBytes() { - return packedValueDocID; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java index 117ce6f80f1..5a918d166d8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java @@ -17,6 +17,7 @@ package org.apache.lucene.util.bkd; import java.util.Arrays; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; @@ -26,22 +27,26 @@ import org.apache.lucene.util.BytesRef; * @lucene.internal */ public final class HeapPointWriter implements PointWriter { - public final byte[] block; + private final byte[] block; final int size; - final BKDConfig config; + private final BKDConfig config; private final byte[] scratch; + private final ArrayUtil.ByteArrayComparator dimComparator; + // length is composed by the data dimensions plus the docID + private final int dataDimsAndDocLength; private int nextWrite; private boolean closed; - - private HeapPointReader.HeapPointValue pointValue; + private final HeapPointValue pointValue; public HeapPointWriter(BKDConfig config, int size) { this.config = config; this.block = new byte[config.bytesPerDoc * size]; this.size = size; + this.dimComparator = ArrayUtil.getUnsignedComparator(config.bytesPerDim); + this.dataDimsAndDocLength = config.bytesPerDoc - config.packedIndexBytesLength; this.scratch = new byte[config.bytesPerDoc]; if (size > 0) { - pointValue = new HeapPointReader.HeapPointValue(config, block); + pointValue = new HeapPointValue(config, block); } else { // no values pointValue = null; @@ -65,10 +70,9 @@ public final class HeapPointWriter implements PointWriter { + packedValue.length + "]"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - System.arraycopy( - packedValue, 0, block, nextWrite * config.bytesPerDoc, config.packedBytesLength); - int position = nextWrite * config.bytesPerDoc + config.packedBytesLength; - BitUtil.VH_BE_INT.set(block, position, docID); + final int position = nextWrite * config.bytesPerDoc; + System.arraycopy(packedValue, 0, block, position, config.packedBytesLength); + BitUtil.VH_BE_INT.set(block, position + config.packedBytesLength, docID); nextWrite++; } @@ -76,27 +80,23 @@ public final class HeapPointWriter implements PointWriter { public void append(PointValue pointValue) { assert closed == false : "point writer is already closed"; assert nextWrite < size : "nextWrite=" + (nextWrite + 1) + " vs size=" + size; - BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); + final BytesRef packedValueDocID = pointValue.packedValueDocIDBytes(); assert packedValueDocID.length == config.bytesPerDoc : "[packedValue] must have length [" + (config.bytesPerDoc) + "] but was [" + packedValueDocID.length + "]"; + final int position = nextWrite * config.bytesPerDoc; System.arraycopy( - packedValueDocID.bytes, - packedValueDocID.offset, - block, - nextWrite * config.bytesPerDoc, - config.bytesPerDoc); + packedValueDocID.bytes, packedValueDocID.offset, block, position, config.bytesPerDoc); nextWrite++; } - public void swap(int i, int j) { - - int indexI = i * config.bytesPerDoc; - int indexJ = j * config.bytesPerDoc; - + /** Swaps the point at point {@code i} with the point at position {@code j} */ + void swap(int i, int j) { + final int indexI = i * config.bytesPerDoc; + final int indexJ = j * config.bytesPerDoc; // scratch1 = values[i] System.arraycopy(block, indexI, scratch, 0, config.bytesPerDoc); // values[i] = values[j] @@ -105,19 +105,100 @@ public final class HeapPointWriter implements PointWriter { System.arraycopy(scratch, 0, block, indexJ, config.bytesPerDoc); } + /** Return the byte at position {@code k} of the point at position {@code i} */ + int byteAt(int i, int k) { + return block[i * config.bytesPerDoc + k] & 0xff; + } + + /** + * Copy the dimension {@code dim} of the point at position {@code i} in the provided {@code bytes} + * at the given offset + */ + void copyDim(int i, int dim, byte[] bytes, int offset) { + System.arraycopy(block, i * config.bytesPerDoc + dim, bytes, offset, config.bytesPerDim); + } + + /** + * Copy the data dimensions and doc value of the point at position {@code i} in the provided + * {@code bytes} at the given offset + */ + void copyDataDimsAndDoc(int i, byte[] bytes, int offset) { + System.arraycopy( + block, + i * config.bytesPerDoc + config.packedIndexBytesLength, + bytes, + offset, + dataDimsAndDocLength); + } + + /** + * Compares the dimension {@code dim} value of the point at position {@code i} with the point at + * position {@code j} + */ + int compareDim(int i, int j, int dim) { + final int iOffset = i * config.bytesPerDoc + dim; + final int jOffset = j * config.bytesPerDoc + dim; + return compareDim(block, iOffset, block, jOffset); + } + + /** + * Compares the dimension {@code dim} value of the point at position {@code j} with the provided + * value + */ + int compareDim(int j, byte[] dimValue, int offset, int dim) { + final int jOffset = j * config.bytesPerDoc + dim; + return compareDim(dimValue, offset, block, jOffset); + } + + private int compareDim(byte[] blockI, int offsetI, byte[] blockJ, int offsetJ) { + return dimComparator.compare(blockI, offsetI, blockJ, offsetJ); + } + + /** + * Compares the data dimensions and doc values of the point at position {@code i} with the point + * at position {@code j} + */ + int compareDataDimsAndDoc(int i, int j) { + final int iOffset = i * config.bytesPerDoc + config.packedIndexBytesLength; + final int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; + return compareDataDimsAndDoc(block, iOffset, block, jOffset); + } + + /** + * Compares the data dimensions and doc values of the point at position {@code j} with the + * provided value + */ + int compareDataDimsAndDoc(int j, byte[] dataDimsAndDocs, int offset) { + final int jOffset = j * config.bytesPerDoc + config.packedIndexBytesLength; + return compareDataDimsAndDoc(dataDimsAndDocs, offset, block, jOffset); + } + + private int compareDataDimsAndDoc(byte[] blockI, int offsetI, byte[] blockJ, int offsetJ) { + return Arrays.compareUnsigned( + blockI, + offsetI, + offsetI + dataDimsAndDocLength, + blockJ, + offsetJ, + offsetJ + dataDimsAndDocLength); + } + + /** Computes the cardinality of the points between {@code from} tp {@code to} */ public int computeCardinality(int from, int to, int[] commonPrefixLengths) { int leafCardinality = 1; for (int i = from + 1; i < to; i++) { + final int pointOffset = (i - 1) * config.bytesPerDoc; + final int nextPointOffset = pointOffset + config.bytesPerDoc; for (int dim = 0; dim < config.numDims; dim++) { final int start = dim * config.bytesPerDim + commonPrefixLengths[dim]; final int end = dim * config.bytesPerDim + config.bytesPerDim; if (Arrays.mismatch( block, - i * config.bytesPerDoc + start, - i * config.bytesPerDoc + end, + nextPointOffset + start, + nextPointOffset + end, block, - (i - 1) * config.bytesPerDoc + start, - (i - 1) * config.bytesPerDoc + end) + pointOffset + start, + pointOffset + end) != -1) { leafCardinality++; break; @@ -139,7 +220,8 @@ public final class HeapPointWriter implements PointWriter { : "start=" + start + " length=" + length + " docIDs.length=" + size; assert start + length <= nextWrite : "start=" + start + " length=" + length + " nextWrite=" + nextWrite; - return new HeapPointReader(config, block, (int) start, Math.toIntExact(start + length)); + return new HeapPointReader( + this::getPackedValueSlice, (int) start, Math.toIntExact(start + length)); } @Override @@ -154,4 +236,40 @@ public final class HeapPointWriter implements PointWriter { public String toString() { return "HeapPointWriter(count=" + nextWrite + " size=" + size + ")"; } + + /** Reusable implementation for a point value on-heap */ + private static class HeapPointValue implements PointValue { + + private final BytesRef packedValue; + private final BytesRef packedValueDocID; + private final int packedValueLength; + + HeapPointValue(BKDConfig config, byte[] value) { + this.packedValueLength = config.packedBytesLength; + this.packedValue = new BytesRef(value, 0, packedValueLength); + this.packedValueDocID = new BytesRef(value, 0, config.bytesPerDoc); + } + + /** Sets a new value by changing the offset. */ + void setOffset(int offset) { + packedValue.offset = offset; + packedValueDocID.offset = offset; + } + + @Override + public BytesRef packedValue() { + return packedValue; + } + + @Override + public int docID() { + int position = packedValueDocID.offset + packedValueLength; + return (int) BitUtil.VH_BE_INT.get(packedValueDocID.bytes, position); + } + + @Override + public BytesRef packedValueDocIDBytes() { + return packedValueDocID; + } + } }