mirror of https://github.com/apache/lucene.git
optimize BKD leaf block writing: use incoming sorted points to compute common prefix (saves one pass); remove an extra copy of the bytes
This commit is contained in:
parent
85d268ee8b
commit
c0e8be7c1b
|
@ -158,11 +158,10 @@ class SimpleTextPointsWriter extends PointsWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
|
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int bytesOffset) throws IOException {
|
||||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
||||||
assert bytes.length == packedBytesLength;
|
|
||||||
write(out, BLOCK_VALUE);
|
write(out, BLOCK_VALUE);
|
||||||
write(out, new BytesRef(bytes, 0, bytes.length).toString());
|
write(out, new BytesRef(bytes, bytesOffset, packedBytesLength).toString());
|
||||||
newline(out);
|
newline(out);
|
||||||
}
|
}
|
||||||
}) {
|
}) {
|
||||||
|
|
|
@ -106,9 +106,9 @@ public class BKDWriter implements Closeable {
|
||||||
final double maxMBSortInHeap;
|
final double maxMBSortInHeap;
|
||||||
|
|
||||||
final byte[] scratchDiff;
|
final byte[] scratchDiff;
|
||||||
final byte[] scratchPackedValue;
|
|
||||||
final byte[] scratch1;
|
final byte[] scratch1;
|
||||||
final byte[] scratch2;
|
final byte[] scratch2;
|
||||||
|
final BytesRef scratchBytesRef = new BytesRef();
|
||||||
final int[] commonPrefixLengths;
|
final int[] commonPrefixLengths;
|
||||||
|
|
||||||
protected final FixedBitSet docsSeen;
|
protected final FixedBitSet docsSeen;
|
||||||
|
@ -152,7 +152,7 @@ public class BKDWriter implements Closeable {
|
||||||
packedBytesLength = numDims * bytesPerDim;
|
packedBytesLength = numDims * bytesPerDim;
|
||||||
|
|
||||||
scratchDiff = new byte[bytesPerDim];
|
scratchDiff = new byte[bytesPerDim];
|
||||||
scratchPackedValue = new byte[packedBytesLength];
|
scratchBytesRef.length = packedBytesLength;
|
||||||
scratch1 = new byte[packedBytesLength];
|
scratch1 = new byte[packedBytesLength];
|
||||||
scratch2 = new byte[packedBytesLength];
|
scratch2 = new byte[packedBytesLength];
|
||||||
commonPrefixLengths = new int[numDims];
|
commonPrefixLengths = new int[numDims];
|
||||||
|
@ -455,7 +455,7 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
System.arraycopy(reader.state.scratchPackedValue, 0, maxPackedValue, 0, packedBytesLength);
|
System.arraycopy(reader.state.scratchPackedValue, 0, maxPackedValue, 0, packedBytesLength);
|
||||||
|
|
||||||
assert numDims > 1 || valueInOrder(valueCount, lastPackedValue, reader.state.scratchPackedValue);
|
assert numDims > 1 || valueInOrder(valueCount, lastPackedValue, reader.state.scratchPackedValue, 0);
|
||||||
valueCount++;
|
valueCount++;
|
||||||
if (pointCount > totalPointCount) {
|
if (pointCount > totalPointCount) {
|
||||||
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
|
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
|
||||||
|
@ -502,7 +502,7 @@ public class BKDWriter implements Closeable {
|
||||||
|
|
||||||
// Write the full values:
|
// Write the full values:
|
||||||
for (int i=0;i<leafCount;i++) {
|
for (int i=0;i<leafCount;i++) {
|
||||||
writeLeafBlockPackedValue(out, commonPrefixLengths, leafBlockPackedValues[i]);
|
writeLeafBlockPackedValue(out, commonPrefixLengths, leafBlockPackedValues[i], 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
leafCount = 0;
|
leafCount = 0;
|
||||||
|
@ -920,10 +920,10 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
|
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
int prefix = commonPrefixLengths[dim];
|
int prefix = commonPrefixLengths[dim];
|
||||||
out.writeBytes(bytes, dim*bytesPerDim+prefix, bytesPerDim-prefix);
|
out.writeBytes(bytes, offset+dim*bytesPerDim+prefix, bytesPerDim-prefix);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -994,13 +994,13 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Called only in assert */
|
/** Called only in assert */
|
||||||
private boolean valueInBounds(byte[] packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
|
private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
int offset = bytesPerDim*dim;
|
int offset = bytesPerDim*dim;
|
||||||
if (StringHelper.compare(bytesPerDim, packedValue, offset, minPackedValue, offset) < 0) {
|
if (StringHelper.compare(bytesPerDim, packedValue.bytes, packedValue.offset + offset, minPackedValue, offset) < 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (StringHelper.compare(bytesPerDim, packedValue, offset, maxPackedValue, offset) > 0) {
|
if (StringHelper.compare(bytesPerDim, packedValue.bytes, packedValue.offset + offset, maxPackedValue, offset) > 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1060,16 +1060,35 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nodeID >= leafNodeOffset) {
|
if (nodeID >= leafNodeOffset) {
|
||||||
|
|
||||||
// Leaf node: write block
|
// Leaf node: write block
|
||||||
|
for (int dim=0;dim<numDims;dim++) {
|
||||||
PathSlice source = slices[0];
|
if (slices[dim].writer instanceof HeapPointWriter == false) {
|
||||||
|
|
||||||
if (source.writer instanceof HeapPointWriter == false) {
|
|
||||||
// Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
|
// Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
|
||||||
// offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
|
// offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
|
||||||
source = switchToHeap(source);
|
slices[dim] = switchToHeap(slices[dim]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PathSlice source = slices[dim];
|
||||||
|
|
||||||
|
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
|
||||||
|
|
||||||
|
// Find common prefix by comparing first and last values, already sorted in this dimension:
|
||||||
|
heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
|
||||||
|
heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2);
|
||||||
|
|
||||||
|
int offset = dim * bytesPerDim;
|
||||||
|
commonPrefixLengths[dim] = bytesPerDim;
|
||||||
|
for(int j=0;j<bytesPerDim;j++) {
|
||||||
|
if (scratch1[offset+j] != scratch2[offset+j]) {
|
||||||
|
commonPrefixLengths[dim] = j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PathSlice source = slices[0];
|
||||||
|
|
||||||
// We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
|
// We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
|
||||||
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
|
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
|
||||||
|
|
||||||
|
@ -1083,37 +1102,21 @@ public class BKDWriter implements Closeable {
|
||||||
assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
|
assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
|
||||||
writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);
|
writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);
|
||||||
|
|
||||||
// First pass: find the per-dim common prefix for all values in this block:
|
// TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
|
||||||
Arrays.fill(commonPrefixLengths, bytesPerDim);
|
// from the index, much like how terms dict does so from the FST:
|
||||||
for (int i=0;i<count;i++) {
|
|
||||||
if (i == 0) {
|
|
||||||
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch1);
|
|
||||||
} else {
|
|
||||||
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
|
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
|
||||||
int offset = dim * bytesPerDim;
|
|
||||||
for(int j=0;j<commonPrefixLengths[dim];j++) {
|
|
||||||
if (scratch1[offset+j] != scratchPackedValue[offset+j]) {
|
|
||||||
commonPrefixLengths[dim] = j;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Write the common prefixes:
|
||||||
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
|
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
|
||||||
|
|
||||||
// Second pass: write the full values:
|
// Write the full values:
|
||||||
byte[] lastPackedValue = new byte[bytesPerDim];
|
byte[] lastPackedValue = new byte[bytesPerDim];
|
||||||
for (int i=0;i<count;i++) {
|
for (int i=0;i<count;i++) {
|
||||||
// TODO: we could do bulk copying here, avoiding the intermediate copy:
|
heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratchBytesRef);
|
||||||
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
|
assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchBytesRef.bytes, scratchBytesRef.offset);
|
||||||
assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchPackedValue);
|
|
||||||
|
|
||||||
// Make sure this value does in fact fall within this leaf cell:
|
// Make sure this value does in fact fall within this leaf cell:
|
||||||
assert valueInBounds(scratchPackedValue, minPackedValue, maxPackedValue);
|
assert valueInBounds(scratchBytesRef, minPackedValue, maxPackedValue);
|
||||||
writeLeafBlockPackedValue(out, commonPrefixLengths, scratchPackedValue);
|
writeLeafBlockPackedValue(out, commonPrefixLengths, scratchBytesRef.bytes, scratchBytesRef.offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
@ -1227,11 +1230,11 @@ public class BKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// only called from assert
|
// only called from assert
|
||||||
private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue) {
|
private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset) {
|
||||||
if (ord > 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
|
if (ord > 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, packedValueOffset) > 0) {
|
||||||
throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue) + " ord=" + ord);
|
throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
|
||||||
}
|
}
|
||||||
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
|
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, bytesPerDim);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,6 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.util.bkd;
|
package org.apache.lucene.util.bkd;
|
||||||
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.util.PagedBytes;
|
import org.apache.lucene.util.PagedBytes;
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
final class HeapPointWriter implements PointWriter {
|
final class HeapPointWriter implements PointWriter {
|
||||||
int[] docIDs;
|
int[] docIDs;
|
||||||
|
@ -72,6 +73,15 @@ final class HeapPointWriter implements PointWriter {
|
||||||
System.arraycopy(blocks.get(block), blockIndex * packedBytesLength, bytes, 0, packedBytesLength);
|
System.arraycopy(blocks.get(block), blockIndex * packedBytesLength, bytes, 0, packedBytesLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
|
||||||
|
void getPackedValueSlice(int index, BytesRef result) {
|
||||||
|
int block = index / valuesPerBlock;
|
||||||
|
int blockIndex = index % valuesPerBlock;
|
||||||
|
result.bytes = blocks.get(block);
|
||||||
|
result.offset = blockIndex * packedBytesLength;
|
||||||
|
assert result.length == packedBytesLength;
|
||||||
|
}
|
||||||
|
|
||||||
void writePackedValue(int index, byte[] bytes) {
|
void writePackedValue(int index, byte[] bytes) {
|
||||||
assert bytes.length == packedBytesLength;
|
assert bytes.length == packedBytesLength;
|
||||||
int block = index / valuesPerBlock;
|
int block = index / valuesPerBlock;
|
||||||
|
|
Loading…
Reference in New Issue