LUCENE-7501: BKDReader should not store the split dimension explicitly in the 1D case.

2016-10-18 09:33:36 +02:00 · 2016-10-18 09:33:36 +02:00 · e6d225603a
parent df09867ccc
commit e6d225603a
4 changed files with 45 additions and 21 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -53,6 +53,9 @@ Improvements
 Optimizations
 * LUCENE-7501: BKDReader should not store the split dimension explicitly in the
  1D case. (Adrien Grand)
 Other
 * LUCENE-7452: Block join query exception suggests how to find a doc, which 
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
@ -138,15 +138,26 @@ class SimpleTextPointsReader extends PointsReader {
    readLine(dataIn);
    count = parseInt(SPLIT_COUNT);
-    byte[] splitPackedValues = new byte[count * (1 + bytesPerDim)];
+    byte[] splitPackedValues;
    int bytesPerIndexEntry;
    if (numDims == 1) {
      bytesPerIndexEntry = bytesPerDim;
    } else {
      bytesPerIndexEntry = 1 + bytesPerDim;
    }
    splitPackedValues = new byte[count * bytesPerIndexEntry];
    for(int i=0;i<count;i++) {
      readLine(dataIn);
-      splitPackedValues[(1 + bytesPerDim) * i] = (byte) parseInt(SPLIT_DIM);
+      int address = bytesPerIndexEntry * i;
      int splitDim = parseInt(SPLIT_DIM);
      if (numDims != 1) {
        splitPackedValues[address++] = (byte) splitDim;
      }
      readLine(dataIn);
      assert startsWith(SPLIT_VALUE);
      BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(SPLIT_VALUE));
      assert br.length == bytesPerDim;
-      System.arraycopy(br.bytes, br.offset, splitPackedValues, (1 + bytesPerDim) * i + 1, bytesPerDim);
+      System.arraycopy(br.bytes, br.offset, splitPackedValues, address, bytesPerDim);
    }
    return new SimpleTextBKDReader(dataIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minValue.bytes, maxValue.bytes, pointCount, docCount);
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@ -26,6 +26,7 @@ import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.StringHelper;
 /** Handles intersection of an multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
@ -39,6 +40,7 @@ public class BKDReader implements Accountable {
  final private int leafNodeOffset;
  final int numDims;
  final int bytesPerDim;
  final int bytesPerIndexEntry;
  final IndexInput in;
  final int maxPointsInLeafNode;
  final byte[] minPackedValue;
@ -54,6 +56,7 @@ public class BKDReader implements Accountable {
    numDims = in.readVInt();
    maxPointsInLeafNode = in.readVInt();
    bytesPerDim = in.readVInt();
    bytesPerIndexEntry = numDims == 1 && version >= BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D ? bytesPerDim : bytesPerDim + 1;
    packedBytesLength = numDims * bytesPerDim;
    // Read index:
@ -70,7 +73,7 @@ public class BKDReader implements Accountable {
    pointCount = in.readVLong();
    docCount = in.readVInt();
-    splitPackedValues = new byte[(1+bytesPerDim)*numLeaves];
+    splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
    // TODO: don't write split packed values[0]!
    in.readBytes(splitPackedValues, 0, splitPackedValues.length);
@ -135,6 +138,7 @@ public class BKDReader implements Accountable {
    this.numDims = numDims;
    this.maxPointsInLeafNode = maxPointsInLeafNode;
    this.bytesPerDim = bytesPerDim;
    bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
    packedBytesLength = numDims * bytesPerDim;
    this.leafNodeOffset = leafBlockFPs.length;
    this.leafBlockFPs = leafBlockFPs;
@ -234,22 +238,22 @@ public class BKDReader implements Accountable {
    } else {
      // Non-leaf node:
-      int address = nodeID * (bytesPerDim+1);
+      int address = nodeID * bytesPerIndexEntry;
-      int splitDim = splitPackedValues[address] & 0xff;
+      int splitDim = numDims == 1 ? 0 : splitPackedValues[address++] & 0xff;
      assert splitDim < numDims;
      byte[] splitPackedValue = new byte[packedBytesLength];
      // Recurse on left sub-tree:
      System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
-      System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      verify(state,
             2*nodeID,
             cellMinPacked, splitPackedValue);
      // Recurse on right sub-tree:
      System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
-      System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      verify(state,
             2*nodeID+1,
             splitPackedValue, cellMaxPacked);
@ -457,8 +461,8 @@ public class BKDReader implements Accountable {
      // Non-leaf node: recurse on the split left and right nodes
      // TODO: save the unused 1 byte prefix (it's always 0) in the 1d case here:
-      int address = nodeID * (bytesPerDim+1);
+      int address = nodeID * bytesPerIndexEntry;
-      int splitDim = splitPackedValues[address] & 0xff;
+      int splitDim = numDims == 1 ? 0 : splitPackedValues[address++] & 0xff;
      assert splitDim < numDims;
      // TODO: can we alloc & reuse this up front?
@ -468,14 +472,14 @@ public class BKDReader implements Accountable {
      // Recurse on left sub-tree:
      System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
-      System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      intersect(state,
                2*nodeID,
                cellMinPacked, splitPackedValue);
      // Recurse on right sub-tree:
      System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
-      System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      intersect(state,
                2*nodeID+1,
                splitPackedValue, cellMaxPacked);
@ -484,16 +488,16 @@ public class BKDReader implements Accountable {
  /** Copies the split value for this node into the provided byte array */
  public void copySplitValue(int nodeID, byte[] splitPackedValue) {
-    int address = nodeID * (bytesPerDim+1);
+    int address = nodeID * bytesPerIndexEntry;
-    int splitDim = splitPackedValues[address] & 0xff;
+    int splitDim = numDims == 1 ? 0 : splitPackedValues[address++] & 0xff;
    assert splitDim < numDims;
-    System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+    System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
  }
  @Override
  public long ramBytesUsed() {
-    return splitPackedValues.length +
+    return RamUsageEstimator.sizeOf(splitPackedValues) +
-      leafBlockFPs.length * Long.BYTES;
+        RamUsageEstimator.sizeOf(leafBlockFPs);
  }
  public byte[] getMinPackedValue() {
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@ -82,7 +82,8 @@ public class BKDWriter implements Closeable {
  public static final int VERSION_START = 0;
  public static final int VERSION_COMPRESSED_DOC_IDS = 1;
  public static final int VERSION_COMPRESSED_VALUES = 2;
-  public static final int VERSION_CURRENT = VERSION_COMPRESSED_VALUES;
+  public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
  public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
  /** How many bytes each docs takes in the fixed-width offline format */
  private final int bytesPerDoc;
@ -1033,10 +1034,15 @@ public class BKDWriter implements Closeable {
    out.writeVLong(pointCount);
    out.writeVInt(docsSeen.cardinality());
    // TODO: for 1D case, don't waste the first byte of each split value (it's always 0)
    // NOTE: splitPackedValues[0] is unused, because nodeID is 1-based:
-    out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
+    if (numDims == 1) {
      // write the index, skipping the byte used to store the split dim since it is always 0
      for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) {
        out.writeBytes(splitPackedValues, i, bytesPerDim);
      }
    } else {
      out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
    }
    long lastFP = 0;
    for (int i=0;i<leafBlockFPs.length;i++) {