From 5e8db2e068f2549b9619d5ac48a50c8032fc292b Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 4 Dec 2016 05:18:04 -0500 Subject: [PATCH] LUCENE-7563: use a compressed format for the in-heap BKD index --- lucene/CHANGES.txt | 4 + .../simpletext/SimpleTextBKDReader.java | 281 ++- .../simpletext/SimpleTextBKDWriter.java | 1661 +++++++++++++++++ .../simpletext/SimpleTextPointsReader.java | 5 +- .../simpletext/SimpleTextPointsWriter.java | 188 +- .../codecs/lucene60/Lucene60PointsFormat.java | 10 +- .../lucene/codecs/lucene60/package-info.java | 4 +- .../lucene/codecs/lucene62/package-info.java | 4 +- .../lucene/codecs/lucene70/package-info.java | 15 +- .../org/apache/lucene/index/CheckIndex.java | 314 ++-- .../org/apache/lucene/util/bkd/BKDReader.java | 717 ++++--- .../org/apache/lucene/util/bkd/BKDWriter.java | 293 ++- .../lucene/util/bkd/HeapPointReader.java | 7 +- .../lucene/util/bkd/HeapPointWriter.java | 22 +- .../util/bkd/MutablePointsReaderUtils.java | 21 +- .../lucene/util/bkd/OfflinePointReader.java | 8 +- .../lucene/util/bkd/OfflinePointWriter.java | 10 +- .../apache/lucene/util/bkd/PointReader.java | 14 +- .../apache/lucene/util/bkd/PointWriter.java | 6 +- .../lucene/search/TestPointQueries.java | 3 + .../lucene/util/bkd/Test2BBKDPoints.java | 11 +- .../org/apache/lucene/util/bkd/TestBKD.java | 54 + .../org/apache/lucene/util/fst/TestFSTs.java | 2 +- .../lucene/document/NearestNeighbor.java | 44 +- 24 files changed, 3030 insertions(+), 668 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4afc5078fa2..79e44e112c8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,6 +126,10 @@ Optimizations * LUCENE-7568: Optimize merging when index sorting is used but the index is already sorted (Jim Ferenczi via Mike McCandless) +* LUCENE-7563: The BKD in-memory index for dimensional points now uses + a compressed format, using substantially less RAM in some cases + (Adrien Grand, Mike McCandless) + Other * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java index a2b784afd27..488547b4dea 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java @@ -16,13 +16,17 @@ */ package org.apache.lucene.codecs.simpletext; - import java.io.IOException; import java.nio.charset.StandardCharsets; +import org.apache.lucene.codecs.simpletext.SimpleTextUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.PointValues; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.bkd.BKDReader; @@ -30,15 +34,105 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_C import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID; import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE; -class SimpleTextBKDReader extends BKDReader { +/** Forked from {@link BKDReader} and 
simplified/specialized for SimpleText's usage */ - public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, +final class SimpleTextBKDReader extends PointValues implements Accountable { + // Packed array of byte[] holding all split values in the full binary tree: + final private byte[] splitPackedValues; + final long[] leafBlockFPs; + final private int leafNodeOffset; + final int numDims; + final int bytesPerDim; + final int bytesPerIndexEntry; + final IndexInput in; + final int maxPointsInLeafNode; + final byte[] minPackedValue; + final byte[] maxPackedValue; + final long pointCount; + final int docCount; + final int version; + protected final int packedBytesLength; + + public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues, byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException { - super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount); + this.in = in; + this.numDims = numDims; + this.maxPointsInLeafNode = maxPointsInLeafNode; + this.bytesPerDim = bytesPerDim; + // no version check here because callers of this API (SimpleText) have no back compat: + bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1; + packedBytesLength = numDims * bytesPerDim; + this.leafNodeOffset = leafBlockFPs.length; + this.leafBlockFPs = leafBlockFPs; + this.splitPackedValues = splitPackedValues; + this.minPackedValue = minPackedValue; + this.maxPackedValue = maxPackedValue; + this.pointCount = pointCount; + this.docCount = docCount; + this.version = SimpleTextBKDWriter.VERSION_CURRENT; + assert minPackedValue.length == packedBytesLength; + assert maxPackedValue.length == packedBytesLength; } - @Override - protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { + /** Used to track all state for a single call to {@link #intersect}. */ + public static final class IntersectState { + final IndexInput in; + final int[] scratchDocIDs; + final byte[] scratchPackedValue; + final int[] commonPrefixLengths; + + final IntersectVisitor visitor; + + public IntersectState(IndexInput in, int numDims, + int packedBytesLength, + int maxPointsInLeafNode, + IntersectVisitor visitor) { + this.in = in; + this.visitor = visitor; + this.commonPrefixLengths = new int[numDims]; + this.scratchDocIDs = new int[maxPointsInLeafNode]; + this.scratchPackedValue = new byte[packedBytesLength]; + } + } + + public void intersect(IntersectVisitor visitor) throws IOException { + intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); + } + + /** Fast path: this is called when the query box fully encompasses all cells under this node. */ + private void addAll(IntersectState state, int nodeID) throws IOException { + //System.out.println("R: addAll nodeID=" + nodeID); + + if (nodeID >= leafNodeOffset) { + //System.out.println("ADDALL"); + visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor); + // TODO: we can assert that the first value here in fact matches what the index claimed? 
+ } else { + addAll(state, 2*nodeID); + addAll(state, 2*nodeID+1); + } + } + + /** Create a new {@link IntersectState} */ + public IntersectState getIntersectState(IntersectVisitor visitor) { + return new IntersectState(in.clone(), numDims, + packedBytesLength, + maxPointsInLeafNode, + visitor); + } + + /** Visits all docIDs and packed values in a single leaf block */ + public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException { + int leafID = nodeID - leafNodeOffset; + + // Leaf node; scan and filter all points in this block: + int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); + + // Again, this time reading values and checking with the visitor + visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); + } + + void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); in.seek(blockFP); readLine(in, scratch); @@ -50,8 +144,7 @@ class SimpleTextBKDReader extends BKDReader { } } - @Override - protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { + int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); in.seek(blockFP); readLine(in, scratch); @@ -63,8 +156,7 @@ class SimpleTextBKDReader extends BKDReader { return count; } - @Override - protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { visitor.grow(count); // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths assert scratchPackedValue.length == packedBytesLength; @@ -79,6 +171,175 @@ class SimpleTextBKDReader extends BKDReader { } } + private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException { + // the byte at `compressedByteOffset` is compressed using run-length compression, + // other suffix bytes are stored verbatim + final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim]; + commonPrefixLengths[compressedDim]++; + int i; + for (i = 0; i < count; ) { + scratchPackedValue[compressedByteOffset] = in.readByte(); + final int runLen = Byte.toUnsignedInt(in.readByte()); + for (int j = 0; j < runLen; ++j) { + for(int dim=0;dim 1.1 MB with 128 points +// per leaf, and you can reduce that by putting more points per leaf +// - we could use threads while building; the higher nodes are very parallelizable + +/** Forked from {@link BKDWriter} and simplified/specialized for SimpleText's usage */ + +final class SimpleTextBKDWriter implements Closeable { + + public static final String CODEC_NAME = "BKD"; + public static final int VERSION_START = 0; + public static final int VERSION_COMPRESSED_DOC_IDS = 1; + public static final int VERSION_COMPRESSED_VALUES = 2; + public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3; + public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D; + + /** How many bytes each docs takes in the fixed-width offline format */ + private final int bytesPerDoc; + + /** Default maximum number of point in each leaf block */ + public static 
final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024; + + /** Default maximum heap to use, before spilling to (slower) disk */ + public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f; + + /** Maximum number of dimensions */ + public static final int MAX_DIMS = 8; + + /** How many dimensions we are indexing */ + protected final int numDims; + + /** How many bytes each value in each dimension takes. */ + protected final int bytesPerDim; + + /** numDims * bytesPerDim */ + protected final int packedBytesLength; + + final BytesRefBuilder scratch = new BytesRefBuilder(); + + final TrackingDirectoryWrapper tempDir; + final String tempFileNamePrefix; + final double maxMBSortInHeap; + + final byte[] scratchDiff; + final byte[] scratch1; + final byte[] scratch2; + final BytesRef scratchBytesRef1 = new BytesRef(); + final BytesRef scratchBytesRef2 = new BytesRef(); + final int[] commonPrefixLengths; + + protected final FixedBitSet docsSeen; + + private OfflinePointWriter offlinePointWriter; + private HeapPointWriter heapPointWriter; + + private IndexOutput tempInput; + protected final int maxPointsInLeafNode; + private final int maxPointsSortInHeap; + + /** Minimum per-dim values, packed */ + protected final byte[] minPackedValue; + + /** Maximum per-dim values, packed */ + protected final byte[] maxPackedValue; + + protected long pointCount; + + /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */ + protected final boolean longOrds; + + /** An upper bound on how many points the caller will add (includes deletions) */ + private final long totalPointCount; + + /** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */ + protected final boolean singleValuePerDoc; + + /** How much heap OfflineSorter is allowed to use */ + protected final OfflineSorter.BufferSize offlineSorterBufferMB; + + /** How much heap OfflineSorter is allowed to use */ + protected final int offlineSorterMaxTempFiles; + + private final int maxDoc; + + public SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException { + this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc, + totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES); + } + + private SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, + boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException { + verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount); + // We use tracking dir to deal with removing files on exception, so each place that + // creates temp files doesn't need crazy try/finally/sucess logic: + this.tempDir = new TrackingDirectoryWrapper(tempDir); + this.tempFileNamePrefix = tempFileNamePrefix; + this.maxPointsInLeafNode = maxPointsInLeafNode; + this.numDims = numDims; + this.bytesPerDim = bytesPerDim; + this.totalPointCount = totalPointCount; + this.maxDoc = maxDoc; + this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB); + this.offlineSorterMaxTempFiles = 
offlineSorterMaxTempFiles; + docsSeen = new FixedBitSet(maxDoc); + packedBytesLength = numDims * bytesPerDim; + + scratchDiff = new byte[bytesPerDim]; + scratch1 = new byte[packedBytesLength]; + scratch2 = new byte[packedBytesLength]; + commonPrefixLengths = new int[numDims]; + + minPackedValue = new byte[packedBytesLength]; + maxPackedValue = new byte[packedBytesLength]; + + // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes). + this.longOrds = longOrds; + + this.singleValuePerDoc = singleValuePerDoc; + + // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int) + if (singleValuePerDoc) { + // Lucene only supports up to 2.1 docs, so we better not need longOrds in this case: + assert longOrds == false; + bytesPerDoc = packedBytesLength + Integer.BYTES; + } else if (longOrds) { + bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES; + } else { + bytesPerDoc = packedBytesLength + Integer.BYTES + Integer.BYTES; + } + + // As we recurse, we compute temporary partitions of the data, halving the + // number of points at each recursion. Once there are few enough points, + // we can switch to sorting in heap instead of offline (on disk). At any + // time in the recursion, we hold the number of points at that level, plus + // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X + // what that level would consume, so we multiply by 0.5 to convert from + // bytes to points here. Each dimension has its own sorted partition, so + // we must divide by numDims as wel. + + maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims)); + + // Finally, we must be able to hold at least the leaf node in heap during build: + if (maxPointsSortInHeap < maxPointsInLeafNode) { + throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode"); + } + + // We write first maxPointsSortInHeap in heap, then cutover to offline for additional points: + heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength, longOrds, singleValuePerDoc); + + this.maxMBSortInHeap = maxMBSortInHeap; + } + + public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) { + // We encode dim in a single byte in the splitPackedValues, but we only expose 4 bits for it now, in case we want to use + // remaining 4 bits for another purpose later + if (numDims < 1 || numDims > MAX_DIMS) { + throw new IllegalArgumentException("numDims must be 1 .. 
" + MAX_DIMS + " (got: " + numDims + ")"); + } + if (maxPointsInLeafNode <= 0) { + throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode); + } + if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) { + throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode); + } + if (maxMBSortInHeap < 0.0) { + throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")"); + } + if (totalPointCount < 0) { + throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")"); + } + } + + /** If the current segment has too many points then we spill over to temp files / offline sort. */ + private void spillToOffline() throws IOException { + + // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree: + offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill", 0, singleValuePerDoc); + tempInput = offlinePointWriter.out; + PointReader reader = heapPointWriter.getReader(0, pointCount); + for(int i=0;i= maxPointsSortInHeap) { + if (offlinePointWriter == null) { + spillToOffline(); + } + offlinePointWriter.append(packedValue, pointCount, docID); + } else { + // Not too many points added yet, continue using heap: + heapPointWriter.append(packedValue, pointCount, docID); + } + + // TODO: we could specialize for the 1D case: + if (pointCount == 0) { + System.arraycopy(packedValue, 0, minPackedValue, 0, packedBytesLength); + System.arraycopy(packedValue, 0, maxPackedValue, 0, packedBytesLength); + } else { + for(int dim=0;dim 0) { + System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim); + } + } + } + + pointCount++; + if (pointCount > totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); + } + docsSeen.set(docID); + } + + /** How many points have been added so far */ + public long getPointCount() { + return pointCount; + } + + private static class MergeReader { + final SimpleTextBKDReader bkd; + final SimpleTextBKDReader.IntersectState state; + final MergeState.DocMap docMap; + + /** Current doc ID */ + public int docID; + + /** Which doc in this block we are up to */ + private int docBlockUpto; + + /** How many docs in the current block */ + private int docsInBlock; + + /** Which leaf block we are up to */ + private int blockID; + + private final byte[] packedValues; + + public MergeReader(SimpleTextBKDReader bkd, MergeState.DocMap docMap) throws IOException { + this.bkd = bkd; + state = new SimpleTextBKDReader.IntersectState(bkd.in.clone(), + bkd.numDims, + bkd.packedBytesLength, + bkd.maxPointsInLeafNode, + null); + this.docMap = docMap; + long minFP = Long.MAX_VALUE; + //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length); + for(long fp : bkd.leafBlockFPs) { + minFP = Math.min(minFP, fp); + //System.out.println(" leaf fp=" + fp); + } + state.in.seek(minFP); + this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength]; + } + + public boolean next() throws IOException { + //System.out.println("MR.next this=" + this); + while (true) { + if (docBlockUpto == docsInBlock) { + if (blockID == bkd.leafBlockFPs.length) { + //System.out.println(" done!"); + return false; + } + 
//System.out.println(" new block @ fp=" + state.in.getFilePointer()); + docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs); + assert docsInBlock > 0; + docBlockUpto = 0; + bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() { + int i = 0; + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assert docID == state.scratchDocIDs[i]; + System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength); + i++; + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + throw new UnsupportedOperationException(); + } + + }); + + blockID++; + } + + final int index = docBlockUpto++; + int oldDocID = state.scratchDocIDs[index]; + + int mappedDocID; + if (docMap == null) { + mappedDocID = oldDocID; + } else { + mappedDocID = docMap.get(oldDocID); + } + + if (mappedDocID != -1) { + // Not deleted! + docID = mappedDocID; + System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength); + return true; + } + } + } + } + + private static class BKDMergeQueue extends PriorityQueue { + private final int bytesPerDim; + + public BKDMergeQueue(int bytesPerDim, int maxSize) { + super(maxSize); + this.bytesPerDim = bytesPerDim; + } + + @Override + public boolean lessThan(MergeReader a, MergeReader b) { + assert a != b; + + int cmp = StringHelper.compare(bytesPerDim, a.state.scratchPackedValue, 0, b.state.scratchPackedValue, 0); + if (cmp < 0) { + return true; + } else if (cmp > 0) { + return false; + } + + // Tie break by sorting smaller docIDs earlier: + return a.docID < b.docID; + } + } + + /** Write a field from a {@link MutablePointValues}. This way of writing + * points is faster than regular writes with {@link BKDWriter#add} since + * there is opportunity for reordering points before writing them to + * disk. This method does not use transient disk in order to reorder points. + */ + public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException { + if (numDims == 1) { + return writeField1Dim(out, fieldName, reader); + } else { + return writeFieldNDims(out, fieldName, reader); + } + } + + + /* In the 2+D case, we recursively pick the split dimension, compute the + * median value and partition other values around it. 
*/ + private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException { + if (pointCount != 0) { + throw new IllegalStateException("cannot mix add and writeField"); + } + + // Catch user silliness: + if (heapPointWriter == null && tempInput == null) { + throw new IllegalStateException("already finished"); + } + + // Mark that we already finished: + heapPointWriter = null; + + long countPerLeaf = pointCount = values.size(); + long innerNodeCount = 1; + + while (countPerLeaf > maxPointsInLeafNode) { + countPerLeaf = (countPerLeaf+1)/2; + innerNodeCount *= 2; + } + + int numLeaves = Math.toIntExact(innerNodeCount); + + checkMaxLeafNodeCount(numLeaves); + + final byte[] splitPackedValues = new byte[numLeaves * (bytesPerDim + 1)]; + final long[] leafBlockFPs = new long[numLeaves]; + + // compute the min/max for this slice + Arrays.fill(minPackedValue, (byte) 0xff); + Arrays.fill(maxPackedValue, (byte) 0); + for (int i = 0; i < Math.toIntExact(pointCount); ++i) { + values.getValue(i, scratchBytesRef1); + for(int dim=0;dim 0) { + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim); + } + } + + docsSeen.set(values.getDocID(i)); + } + + build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out, + minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, + new int[maxPointsInLeafNode]); + + long indexFP = out.getFilePointer(); + writeIndex(out, leafBlockFPs, splitPackedValues); + return indexFP; + } + + + /* In the 1D case, we can simply sort points in ascending order and use the + * same writing logic as we use at merge time. */ + private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException { + MutablePointsReaderUtils.sort(maxDoc, packedBytesLength, reader, 0, Math.toIntExact(reader.size())); + + final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out); + + reader.intersect(new IntersectVisitor() { + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + oneDimWriter.add(packedValue, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new IllegalStateException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + return oneDimWriter.finish(); + } + + // TODO: remove this opto: SimpleText is supposed to be simple! + + /** More efficient bulk-add for incoming {@link SimpleTextBKDReader}s. This does a merge sort of the already + * sorted values and currently only works when numDims==1. This returns -1 if all documents containing + * dimensional values were deleted. */ + public long merge(IndexOutput out, List docMaps, List readers) throws IOException { + assert docMaps == null || readers.size() == docMaps.size(); + + BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size()); + + for(int i=0;i totalPointCount) { + throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values"); + } + + if (leafCount == maxPointsInLeafNode) { + // We write a block once we hit exactly the max count ... 
this is different from + // when we flush a new segment, where we write between max/2 and max per leaf block, + // so merged segments will behave differently from newly flushed segments: + writeLeafBlock(); + leafCount = 0; + } + + assert (lastDocID = docID) >= 0; // only assign when asserts are enabled + } + + public long finish() throws IOException { + if (leafCount > 0) { + writeLeafBlock(); + leafCount = 0; + } + + if (valueCount == 0) { + return -1; + } + + pointCount = valueCount; + + long indexFP = out.getFilePointer(); + + int numInnerNodes = leafBlockStartValues.size(); + + //System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" + leafBlockStartValues.size()); + + byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)]; + rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues); + long[] arr = new long[leafBlockFPs.size()]; + for(int i=0;i 0) { + // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end: + leafBlockStartValues.add(Arrays.copyOf(leafValues, packedBytesLength)); + } + leafBlockFPs.add(out.getFilePointer()); + checkMaxLeafNodeCount(leafBlockFPs.size()); + + Arrays.fill(commonPrefixLengths, bytesPerDim); + // Find per-dim common prefix: + for(int dim=0;dim packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); + + { + scratch.length = packedBytesLength; + scratch.bytes = leafValues; + } + + @Override + public BytesRef apply(int i) { + scratch.offset = packedBytesLength * i; + return scratch; + } + }; + assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength), + Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength), + packedValues, leafDocs, 0); + writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues); + } + + } + + // TODO: there must be a simpler way? 
+ private void rotateToTree(int nodeID, int offset, int count, byte[] index, List leafBlockStartValues) { + //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length); + if (count == 1) { + // Leaf index node + //System.out.println(" leaf index node"); + //System.out.println(" index[" + nodeID + "] = blockStartValues[" + offset + "]"); + System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim); + } else if (count > 1) { + // Internal index node: binary partition of count + int countAtLevel = 1; + int totalCount = 0; + while (true) { + int countLeft = count - totalCount; + //System.out.println(" cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel); + if (countLeft <= countAtLevel) { + // This is the last level, possibly partially filled: + int lastLeftCount = Math.min(countAtLevel/2, countLeft); + assert lastLeftCount >= 0; + int leftHalf = (totalCount-1)/2 + lastLeftCount; + + int rootOffset = offset + leftHalf; + /* + System.out.println(" last left count " + lastLeftCount); + System.out.println(" leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1)); + System.out.println(" rootOffset=" + rootOffset); + */ + + System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim); + //System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]"); + + // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree + // under here, to save this while loop on each recursion + + // Recurse left + rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues); + + // Recurse right + rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues); + return; + } + totalCount += countAtLevel; + countAtLevel *= 2; + } + } else { + assert count == 0; + } + } + + // TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length + // encoding and not have our own ByteSequencesReader/Writer + + /** Sort the heap writer by the specified dim */ + private void sortHeapPointWriter(final HeapPointWriter writer, int dim) { + final int pointCount = Math.toIntExact(this.pointCount); + // Tie-break by docID: + + // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it + // can't matter at search time since we don't write ords into the index: + new MSBRadixSorter(bytesPerDim + Integer.BYTES) { + + @Override + protected int byteAt(int i, int k) { + assert k >= 0; + if (k < bytesPerDim) { + // dim bytes + int block = i / writer.valuesPerBlock; + int index = i % writer.valuesPerBlock; + return writer.blocks.get(block)[index * packedBytesLength + dim * bytesPerDim + k] & 0xff; + } else { + // doc id + int s = 3 - (k - bytesPerDim); + return (writer.docIDs[i] >>> (s * 8)) & 0xff; + } + } + + @Override + protected void swap(int i, int j) { + int docID = writer.docIDs[i]; + writer.docIDs[i] = writer.docIDs[j]; + writer.docIDs[j] = docID; + + if (singleValuePerDoc == false) { + if (longOrds) { + long ord = writer.ordsLong[i]; + writer.ordsLong[i] = writer.ordsLong[j]; + writer.ordsLong[j] = ord; + } else { + int ord = writer.ords[i]; + writer.ords[i] = writer.ords[j]; + writer.ords[j] = ord; + } + } + + byte[] blockI = writer.blocks.get(i / writer.valuesPerBlock); + int indexI = (i % writer.valuesPerBlock) * 
packedBytesLength; + byte[] blockJ = writer.blocks.get(j / writer.valuesPerBlock); + int indexJ = (j % writer.valuesPerBlock) * packedBytesLength; + + // scratch1 = values[i] + System.arraycopy(blockI, indexI, scratch1, 0, packedBytesLength); + // values[i] = values[j] + System.arraycopy(blockJ, indexJ, blockI, indexI, packedBytesLength); + // values[j] = scratch1 + System.arraycopy(scratch1, 0, blockJ, indexJ, packedBytesLength); + } + + }.sort(0, pointCount); + } + + private PointWriter sort(int dim) throws IOException { + assert dim >= 0 && dim < numDims; + + if (heapPointWriter != null) { + + assert tempInput == null; + + // We never spilled the incoming points to disk, so now we sort in heap: + HeapPointWriter sorted; + + if (dim == 0) { + // First dim can re-use the current heap writer + sorted = heapPointWriter; + } else { + // Subsequent dims need a private copy + sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc); + sorted.copyFrom(heapPointWriter); + } + + //long t0 = System.nanoTime(); + sortHeapPointWriter(sorted, dim); + //long t1 = System.nanoTime(); + //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec"); + + sorted.close(); + return sorted; + } else { + + // Offline sort: + assert tempInput != null; + + final int offset = bytesPerDim * dim; + + Comparator cmp; + if (dim == numDims - 1) { + // in that case the bytes for the dimension and for the doc id are contiguous, + // so we don't need a branch + cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) { + @Override + protected int byteAt(BytesRef ref, int i) { + return ref.bytes[ref.offset + offset + i] & 0xff; + } + }; + } else { + cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) { + @Override + protected int byteAt(BytesRef ref, int i) { + if (i < bytesPerDim) { + return ref.bytes[ref.offset + offset + i] & 0xff; + } else { + return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff; + } + } + }; + } + + OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) { + + /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */ + @Override + protected ByteSequencesWriter getWriter(IndexOutput out) { + return new ByteSequencesWriter(out) { + @Override + public void write(byte[] bytes, int off, int len) throws IOException { + assert len == bytesPerDoc: "len=" + len + " bytesPerDoc=" + bytesPerDoc; + out.writeBytes(bytes, off, len); + } + }; + } + + /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. 
*/ + @Override + protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException { + return new ByteSequencesReader(in, name) { + final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]); + @Override + public BytesRef next() throws IOException { + if (in.getFilePointer() >= end) { + return null; + } + in.readBytes(scratch.bytes, 0, bytesPerDoc); + return scratch; + } + }; + } + }; + + String name = sorter.sort(tempInput.getName()); + + return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc); + } + } + + private void checkMaxLeafNodeCount(int numLeaves) { + if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { + throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex"); + } + } + + /** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */ + public long finish(IndexOutput out) throws IOException { + // System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapPointWriter); + + // TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...) + + // Catch user silliness: + if (heapPointWriter == null && tempInput == null) { + throw new IllegalStateException("already finished"); + } + + if (offlinePointWriter != null) { + offlinePointWriter.close(); + } + + if (pointCount == 0) { + throw new IllegalStateException("must index at least one point"); + } + + LongBitSet ordBitSet; + if (numDims > 1) { + if (singleValuePerDoc) { + ordBitSet = new LongBitSet(maxDoc); + } else { + ordBitSet = new LongBitSet(pointCount); + } + } else { + ordBitSet = null; + } + + long countPerLeaf = pointCount; + long innerNodeCount = 1; + + while (countPerLeaf > maxPointsInLeafNode) { + countPerLeaf = (countPerLeaf+1)/2; + innerNodeCount *= 2; + } + + int numLeaves = (int) innerNodeCount; + + checkMaxLeafNodeCount(numLeaves); + + // NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each + // step of the recursion to recompute the split dim: + + // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each recursion says which dim we split on. + byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))]; + + // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 
7) + long[] leafBlockFPs = new long[numLeaves]; + + // Make sure the math above "worked": + assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode; + + // Sort all docs once by each dimension: + PathSlice[] sortedPointWriters = new PathSlice[numDims]; + + // This is only used on exception; on normal code paths we close all files we opened: + List toCloseHeroically = new ArrayList<>(); + + boolean success = false; + try { + //long t0 = System.nanoTime(); + for(int dim=0;dim packedValues) throws IOException { + for (int i = 0; i < count; ++i) { + BytesRef packedValue = packedValues.apply(i); + // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths + write(out, BLOCK_VALUE); + write(out, packedValue.toString()); + newline(out); + } + } + + private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction packedValues) throws IOException { + for (int i = start; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + assert ref.length == packedBytesLength; + + for(int dim=0;dim packedValues, int start, int end, int byteOffset) { + BytesRef first = packedValues.apply(start); + byte b = first.bytes[first.offset + byteOffset]; + for (int i = start + 1; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + byte b2 = ref.bytes[ref.offset + byteOffset]; + assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b); + if (b != b2) { + return i - start; + } + } + return end - start; + } + + @Override + public void close() throws IOException { + if (tempInput != null) { + // NOTE: this should only happen on exception, e.g. caller calls close w/o calling finish: + try { + tempInput.close(); + } finally { + tempDir.deleteFile(tempInput.getName()); + tempInput = null; + } + } + } + + /** Sliced reference to points in an OfflineSorter.ByteSequencesWriter file. */ + private static final class PathSlice { + final PointWriter writer; + final long start; + final long count; + + public PathSlice(PointWriter writer, long start, long count) { + this.writer = writer; + this.start = start; + this.count = count; + } + + @Override + public String toString() { + return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")"; + } + } + + /** Called on exception, to check whether the checksum is also corrupt in this source, and add that + * information (checksum matched or didn't) as a suppressed exception. */ + private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException { + // TODO: we could improve this, to always validate checksum as we recurse, if we shared left and + // right reader after recursing to children, and possibly within recursed children, + // since all together they make a single pass through the file. But this is a sizable re-org, + // and would mean leaving readers (IndexInputs) open for longer: + if (writer instanceof OfflinePointWriter) { + // We are reading from a temp file; go verify the checksum: + String tempFileName = ((OfflinePointWriter) writer).name; + try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) { + CodecUtil.checkFooter(in, priorException); + } + } else { + // We are reading from heap; nothing to add: + IOUtils.reThrow(priorException); + } + } + + /** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). 
*/ + private byte[] markRightTree(long rightCount, int splitDim, PathSlice source, LongBitSet ordBitSet) throws IOException { + + // Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim: + + // Read the split value, then mark all ords in the right tree (larger than the split value): + + // TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work? + try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) { + boolean result = reader.next(); + assert result; + System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim); + if (numDims > 1) { + assert ordBitSet.get(reader.ord()) == false; + ordBitSet.set(reader.ord()); + // Subtract 1 from rightCount because we already did the first value above (so we could record the split value): + reader.markOrds(rightCount-1, ordBitSet); + } + } catch (Throwable t) { + verifyChecksum(t, source.writer); + } + + return scratch1; + } + + /** Called only in assert */ + private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) { + for(int dim=0;dim 0) { + return false; + } + } + + return true; + } + + protected int split(byte[] minPackedValue, byte[] maxPackedValue) { + // Find which dim has the largest span so we can split on it: + int splitDim = -1; + for(int dim=0;dim 0) { + System.arraycopy(scratchDiff, 0, scratch1, 0, bytesPerDim); + splitDim = dim; + } + } + + //System.out.println("SPLIT: " + splitDim); + return splitDim; + } + + /** Pull a partition back into heap once the point count is low enough while recursing. */ + private PathSlice switchToHeap(PathSlice source, List toCloseHeroically) throws IOException { + int count = Math.toIntExact(source.count); + // Not inside the try because we don't want to close it here: + PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically); + try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) { + for(int i=0;i= leafNodeOffset) { + // leaf node + final int count = to - from; + assert count <= maxPointsInLeafNode; + + // Compute common prefixes + Arrays.fill(commonPrefixLengths, bytesPerDim); + reader.getValue(from, scratchBytesRef1); + for (int i = from + 1; i < to; ++i) { + reader.getValue(i, scratchBytesRef2); + for (int dim=0;dim packedValues = new IntFunction() { + @Override + public BytesRef apply(int i) { + reader.getValue(from + i, scratchBytesRef1); + return scratchBytesRef1; + } + }; + assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, + docIDs, 0); + writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); + + } else { + // inner node + + // compute the split dimension and partition around it + final int splitDim = split(minPackedValue, maxPackedValue); + final int mid = (from + to + 1) >>> 1; + + int commonPrefixLen = bytesPerDim; + for (int i = 0; i < bytesPerDim; ++i) { + if (minPackedValue[splitDim * bytesPerDim + i] != maxPackedValue[splitDim * bytesPerDim + i]) { + commonPrefixLen = i; + break; + } + } + MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen, + reader, from, to, mid, scratchBytesRef1, scratchBytesRef2); + + // set the split value + final int address = nodeID * (1+bytesPerDim); + splitPackedValues[address] = (byte) splitDim; + reader.getValue(mid, 
scratchBytesRef1); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim); + + byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength); + byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, + minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); + System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, + maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); + + // recurse + build(nodeID * 2, leafNodeOffset, reader, from, mid, out, + minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); + build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out, + minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); + } + } + + /** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */ + private void build(int nodeID, int leafNodeOffset, + PathSlice[] slices, + LongBitSet ordBitSet, + IndexOutput out, + byte[] minPackedValue, byte[] maxPackedValue, + byte[] splitPackedValues, + long[] leafBlockFPs, + List toCloseHeroically) throws IOException { + + for(PathSlice slice : slices) { + assert slice.count == slices[0].count; + } + + if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) { + // Special case for 1D, to cutover to heap once we recurse deeply enough: + slices[0] = switchToHeap(slices[0], toCloseHeroically); + } + + if (nodeID >= leafNodeOffset) { + + // Leaf node: write block + // We can write the block in any order so by default we write it sorted by the dimension that has the + // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient + int sortedDim = 0; + int sortedDimCardinality = Integer.MAX_VALUE; + + for (int dim=0;dim= maxPointsInLeafNode, so we better be in heap at this point: + HeapPointWriter heapSource = (HeapPointWriter) source.writer; + + // Save the block file pointer: + leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer(); + //System.out.println(" write leaf block @ fp=" + out.getFilePointer()); + + // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o + // loading the values: + int count = Math.toIntExact(source.count); + assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; + writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count); + + // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us + // from the index, much like how terms dict does so from the FST: + + // Write the full values: + IntFunction packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); + + { + scratch.length = packedBytesLength; + } + + @Override + public BytesRef apply(int i) { + heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch); + return scratch; + } + }; + assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, + heapSource.docIDs, Math.toIntExact(source.start)); + writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); + + } else { + // Inner node: partition/recurse + + int splitDim; + if (numDims > 1) { + splitDim = split(minPackedValue, maxPackedValue); + } else { 
+ splitDim = 0; + } + + PathSlice source = slices[splitDim]; + + assert nodeID < splitPackedValues.length: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length; + + // How many points will be in the left tree: + long rightCount = source.count / 2; + long leftCount = source.count - rightCount; + + byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet); + int address = nodeID * (1+bytesPerDim); + splitPackedValues[address] = (byte) splitDim; + System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim); + + // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse: + + PathSlice[] leftSlices = new PathSlice[numDims]; + PathSlice[] rightSlices = new PathSlice[numDims]; + + byte[] minSplitPackedValue = new byte[packedBytesLength]; + System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength); + + byte[] maxSplitPackedValue = new byte[packedBytesLength]; + System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength); + + // When we are on this dim, below, we clear the ordBitSet: + int dimToClear; + if (numDims - 1 == splitDim) { + dimToClear = numDims - 2; + } else { + dimToClear = numDims - 1; + } + + for(int dim=0;dim values, int[] docs, int docsOffset) throws IOException { + byte[] lastPackedValue = new byte[packedBytesLength]; + int lastDoc = -1; + for (int i=0;i 0) { + int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, dimOffset, packedValue, packedValueOffset + dimOffset); + if (cmp > 0) { + throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord); + } + if (cmp == 0 && doc < lastDoc) { + throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord); + } + } + System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, packedBytesLength); + return true; + } + + PointWriter getPointWriter(long count, String desc) throws IOException { + if (count <= maxPointsSortInHeap) { + int size = Math.toIntExact(count); + return new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc); + } else { + return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, desc, count, singleValuePerDoc); + } + } + + private void write(IndexOutput out, String s) throws IOException { + SimpleTextUtil.write(out, s, scratch); + } + + private void writeInt(IndexOutput out, int x) throws IOException { + SimpleTextUtil.write(out, Integer.toString(x), scratch); + } + + private void writeLong(IndexOutput out, long x) throws IOException { + SimpleTextUtil.write(out, Long.toString(x), scratch); + } + + private void write(IndexOutput out, BytesRef b) throws IOException { + SimpleTextUtil.write(out, b); + } + + private void newline(IndexOutput out) throws IOException { + SimpleTextUtil.writeNewline(out); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java index f7ff16ecbc2..453bd2384b2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java @@ -36,7 +36,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import 
org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.bkd.BKDReader;
 
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
 import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
@@ -58,7 +57,7 @@ class SimpleTextPointsReader extends PointsReader {
 
   private final IndexInput dataIn;
   final SegmentReadState readState;
-  final Map<String,BKDReader> readers = new HashMap<>();
+  final Map<String,SimpleTextBKDReader> readers = new HashMap<>();
   final BytesRefBuilder scratch = new BytesRefBuilder();
 
   public SimpleTextPointsReader(SegmentReadState readState) throws IOException {
@@ -98,7 +97,7 @@
     this.readState = readState;
   }
 
-  private BKDReader initReader(long fp) throws IOException {
+  private SimpleTextBKDReader initReader(long fp) throws IOException {
     // NOTE: matches what writeIndex does in SimpleTextPointsWriter
     dataIn.seek(fp);
     readLine(dataIn);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
index c06c128d154..9d2db890fa0 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.simpletext;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.function.IntFunction;
 
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PointsWriter;
@@ -33,29 +32,28 @@ import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.bkd.BKDWriter;
 
 class SimpleTextPointsWriter extends PointsWriter {
 
-  final static BytesRef NUM_DIMS = new BytesRef("num dims ");
-  final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
-  final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
-  final static BytesRef INDEX_COUNT = new BytesRef("index count ");
-  final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
-  final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
-  final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
-  final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
-  final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
-  final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
-  final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
-  final static BytesRef FIELD_COUNT = new BytesRef("field count ");
-  final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
-  final static BytesRef FIELD_FP = new BytesRef(" field fp ");
-  final static BytesRef MIN_VALUE = new BytesRef("min value ");
-  final static BytesRef MAX_VALUE = new BytesRef("max value ");
-  final static BytesRef POINT_COUNT = new BytesRef("point count ");
-  final static BytesRef DOC_COUNT = new BytesRef("doc count ");
-  final static BytesRef END = new BytesRef("END");
+  public final static BytesRef NUM_DIMS = new BytesRef("num dims ");
+  public final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
+  public final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
+  public final static BytesRef INDEX_COUNT = new BytesRef("index count ");
+  public final static BytesRef
BLOCK_COUNT = new BytesRef("block count "); + public final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc "); + public final static BytesRef BLOCK_FP = new BytesRef(" block fp "); + public final static BytesRef BLOCK_VALUE = new BytesRef(" block value "); + public final static BytesRef SPLIT_COUNT = new BytesRef("split count "); + public final static BytesRef SPLIT_DIM = new BytesRef(" split dim "); + public final static BytesRef SPLIT_VALUE = new BytesRef(" split value "); + public final static BytesRef FIELD_COUNT = new BytesRef("field count "); + public final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name "); + public final static BytesRef FIELD_FP = new BytesRef(" field fp "); + public final static BytesRef MIN_VALUE = new BytesRef("min value "); + public final static BytesRef MAX_VALUE = new BytesRef("max value "); + public final static BytesRef POINT_COUNT = new BytesRef("point count "); + public final static BytesRef DOC_COUNT = new BytesRef("doc count "); + public final static BytesRef END = new BytesRef("END"); private IndexOutput dataOut; final BytesRefBuilder scratch = new BytesRefBuilder(); @@ -75,105 +73,15 @@ class SimpleTextPointsWriter extends PointsWriter { boolean singleValuePerDoc = values.size() == values.getDocCount(); // We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk: - try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(), - writeState.directory, - writeState.segmentInfo.name, - fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes(), - BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, - BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, - values.size(), - singleValuePerDoc) { - - @Override - protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { - write(out, NUM_DIMS); - writeInt(out, numDims); - newline(out); - - write(out, BYTES_PER_DIM); - writeInt(out, bytesPerDim); - newline(out); - - write(out, MAX_LEAF_POINTS); - writeInt(out, maxPointsInLeafNode); - newline(out); - - write(out, INDEX_COUNT); - writeInt(out, leafBlockFPs.length); - newline(out); - - write(out, MIN_VALUE); - BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length); - write(out, br.toString()); - newline(out); - - write(out, MAX_VALUE); - br = new BytesRef(maxPackedValue, 0, maxPackedValue.length); - write(out, br.toString()); - newline(out); - - write(out, POINT_COUNT); - writeLong(out, pointCount); - newline(out); - - write(out, DOC_COUNT); - writeInt(out, docsSeen.cardinality()); - newline(out); - - for(int i=0;i packedValues) throws IOException { - for (int i = 0; i < count; ++i) { - BytesRef packedValue = packedValues.apply(i); - // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths - write(out, BLOCK_VALUE); - write(out, packedValue.toString()); - newline(out); - } - } - }) { + try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(), + writeState.directory, + writeState.segmentInfo.name, + fieldInfo.getPointDimensionCount(), + fieldInfo.getPointNumBytes(), + SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, + SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, + values.size(), + singleValuePerDoc)) { values.intersect(new IntersectVisitor() { @Override @@ -198,26 +106,6 @@ class SimpleTextPointsWriter extends PointsWriter { } } - private void write(IndexOutput out, String s) throws IOException { - SimpleTextUtil.write(out, s, scratch); - } - - private void writeInt(IndexOutput out, int x) throws 
IOException { - SimpleTextUtil.write(out, Integer.toString(x), scratch); - } - - private void writeLong(IndexOutput out, long x) throws IOException { - SimpleTextUtil.write(out, Long.toString(x), scratch); - } - - private void write(IndexOutput out, BytesRef b) throws IOException { - SimpleTextUtil.write(out, b); - } - - private void newline(IndexOutput out) throws IOException { - SimpleTextUtil.writeNewline(out); - } - @Override public void finish() throws IOException { SimpleTextUtil.write(dataOut, END); @@ -250,4 +138,24 @@ class SimpleTextPointsWriter extends PointsWriter { } } } + + private void write(IndexOutput out, String s) throws IOException { + SimpleTextUtil.write(out, s, scratch); + } + + private void writeInt(IndexOutput out, int x) throws IOException { + SimpleTextUtil.write(out, Integer.toString(x), scratch); + } + + private void writeLong(IndexOutput out, long x) throws IOException { + SimpleTextUtil.write(out, Long.toString(x), scratch); + } + + private void write(IndexOutput out, BytesRef b) throws IOException { + SimpleTextUtil.write(out, b); + } + + private void newline(IndexOutput out) throws IOException { + SimpleTextUtil.writeNewline(out); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java index e558d0d4fa8..1d2285c73b6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java @@ -28,7 +28,8 @@ import org.apache.lucene.index.SegmentWriteState; /** * Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure - * for fast shape intersection filtering. See this paper for details. + * for fast 1D range and N dimensional shape intersection filtering. + * See this paper for details. * *
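 * <p>For orientation, a hedged usage sketch of the 1D case at the user API level
 * ({@code indexWriter}, {@code searcher} and the field name {@code price} are
 * illustrative assumptions, not part of this format):
 * <pre>
 *   import org.apache.lucene.document.Document;
 *   import org.apache.lucene.document.IntPoint;
 *
 *   Document doc = new Document();
 *   doc.add(new IntPoint("price", 42));   // 1D point value, encoded into the BKD tree
 *   indexWriter.addDocument(doc);
 *
 *   // at search time, answered by walking this format's block KD-tree:
 *   searcher.search(IntPoint.newRangeQuery("price", 10, 100), 10);
 * </pre>
 *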

This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced * binary tree of split values referencing those blocks at the leaves. @@ -50,10 +51,13 @@ import org.apache.lucene.index.SegmentWriteState; *

  • maxPointsInLeafNode (vInt) *
  • bytesPerDim (vInt) *
  • count (vInt) - *
  • byte[bytesPerDim] × count (packed byte[] of all split values) - *
  • delta-blockFP (vLong) × count (delta-coded file pointers to the on-disk leaf blocks) + *
  • packed index (byte[]) * * + *

    The packed index uses hierarchical delta and prefix coding to compactly encode the file pointers for + * all leaf blocks, as the tree is traversed, as well as the split dimension and split value for each + * inner node of the tree. + *
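+ * <p>A hedged encoder sketch for a single inner node's split value, written to be
+ * consistent with the decode in {@code PackedIndexTree.readNodeData} later in this
+ * patch (names here are illustrative; the actual writer logic is
+ * {@code BKDWriter.recursePackIndex}):
+ * <pre>
+ *   void writeSplitValue(DataOutput out, int splitDim, int numDims, int bytesPerDim,
+ *                        byte[] lastSplitValues, byte[] splitValue) throws IOException {
+ *     int offset = splitDim * bytesPerDim;
+ *     int prefix = 0;   // common prefix with the last split value on this dim
+ *     while (prefix < bytesPerDim
+ *         && splitValue[offset+prefix] == lastSplitValues[offset+prefix]) {
+ *       prefix++;
+ *     }
+ *     // delta of the first differing byte; the real writer tracks its sign via
+ *     // negativeDeltas, so taking abs() here is recoverable at read time:
+ *     int firstDiffByteDelta = prefix == bytesPerDim ? 0
+ *         : Math.abs((splitValue[offset+prefix] & 0xFF) - (lastSplitValues[offset+prefix] & 0xFF));
+ *     // pack splitDim, prefix and firstDiffByteDelta into a single vInt:
+ *     int code = (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numDims + splitDim;
+ *     out.writeVInt(code);
+ *     int suffix = bytesPerDim - prefix;
+ *     if (suffix > 1) {   // remaining bytes after the first differing byte
+ *       out.writeBytes(splitValue, offset + prefix + 1, suffix - 1);
+ *     }
+ *   }
+ * </pre>
+ *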

    After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum. * *
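 * <p>A minimal sketch of that footer convention ({@code dir}, the file name and the
 * codec string are illustrative assumptions):
 * <pre>
 *   try (IndexOutput out = dir.createOutput("_0.dim", IOContext.DEFAULT)) {
 *     CodecUtil.writeHeader(out, "Lucene60PointsFormatData", 0);
 *     // ... per-field leaf blocks, then each field's packed index ...
 *     CodecUtil.writeFooter(out);   // footer magic + algorithm id + CRC32 checksum
 *   }
 * </pre>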

    The .dii file records the file pointer in the .dim file where each field's diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java index 8968a6d624c..a914001d9d2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java @@ -16,7 +16,7 @@ */ /** - * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62} - * for an overview of the index format. + * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70} + * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene60; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java index 2fe2dc74b4a..fb556732d08 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java @@ -17,8 +17,8 @@ /** * Components from the Lucene 6.2 index format - * See {@link org.apache.lucene.codecs.lucene62} for an overview - * of the index format. + * See {@link org.apache.lucene.codecs.lucene70} for an overview + * of the current index format. */ package org.apache.lucene.codecs.lucene62; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java index 9b432f7c4f4..cab2859766e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java @@ -185,6 +185,12 @@ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}. * An optional file indicating which documents are live. *

  • + *
  • + * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}. + * Optional pair of files, recording dimensionally indexed fields, to enable fast + * numeric range filtering, support for large numeric values like BigInteger and BigDecimal (1D), + * and geographic shape intersection (2D, 3D). +
  • * *

    Details on each of these are provided in their linked pages.

    * @@ -300,7 +306,12 @@ * * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} * .liv - * Info about what files are live + * Info about what documents are live + * + * + * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values} + * .dii, .dim + * Holds indexed points, if any * * * @@ -374,6 +385,8 @@ * that is suitable for faceting/sorting/analytics. *
  • In version 5.4, DocValues have been improved to store more information on disk: * addresses for binary fields and ord indexes for multi-valued fields. + *
  • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
  • In version 6.2, a new segment info format was added that records the index sort, to support index sorting. *
  • In version 7.0, DocValues have been improved to better support sparse doc values * thanks to an iterator API. *
  • diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 7bc08f3c4a8..fd8011d4d07 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1801,161 +1801,32 @@ public final class CheckIndex implements Closeable { } for (FieldInfo fieldInfo : fieldInfos) { if (fieldInfo.getPointDimensionCount() > 0) { - FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc()); - status.totalValueFields++; - int dimCount = fieldInfo.getPointDimensionCount(); - int bytesPerDim = fieldInfo.getPointNumBytes(); - int packedBytesCount = dimCount * bytesPerDim; - byte[] lastMinPackedValue = new byte[packedBytesCount]; - byte[] lastMaxPackedValue = new byte[packedBytesCount]; - BytesRef scratch = new BytesRef(); - scratch.length = bytesPerDim; - byte[] lastPackedValue = new byte[packedBytesCount]; - - long[] pointCountSeen = new long[1]; - PointValues values = pointsReader.getValues(fieldInfo.name); if (values == null) { continue; } - byte[] globalMinPackedValue = values.getMinPackedValue(); + + status.totalValueFields++; + long size = values.size(); int docCount = values.getDocCount(); - if (docCount > size) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount); + VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values); + values.intersect(visitor); + + if (visitor.getPointCountSeen() != size) { + throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen()); } - if (docCount > reader.maxDoc()) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but that's greater than maxDoc=" + reader.maxDoc()); + if (visitor.getDocCountSeen() != docCount) { + throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + visitor.getDocCountSeen()); } - if (globalMinPackedValue == null) { - if (size != 0) { - throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size); - } - } else if (globalMinPackedValue.length != packedBytesCount) { - throw new RuntimeException("getMinPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount); - } - byte[] globalMaxPackedValue = values.getMaxPackedValue(); - if (globalMaxPackedValue == null) { - if (size != 0) { - throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size); - } - } else if (globalMaxPackedValue.length != packedBytesCount) { - throw new RuntimeException("getMaxPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount); - } - - values.intersect(new PointValues.IntersectVisitor() { - - private int lastDocID = -1; - - @Override - public void visit(int docID) { - throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID); - } - - @Override - public void visit(int docID, byte[] packedValue) { - checkPackedValue("packed value", packedValue, docID); - pointCountSeen[0]++; - docsSeen.set(docID); - 
- for(int dim=0;dim 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - // Make sure this cell is not outside of the global min/max: - if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) { - throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + - " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - - if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) { - throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + - " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) { - throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + - " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\""); - } - } - - // We always pretend the query shape is so complex that it crosses every cell, so - // that packedValue is passed for every document - return PointValues.Relation.CELL_CROSSES_QUERY; - } - - private void checkPackedValue(String desc, byte[] packedValue, int docID) { - if (packedValue == null) { - throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldInfo.name + "\""); - } - - if (packedValue.length != packedBytesCount) { - throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldInfo.name + "\""); - } - } - }); - - if (pointCountSeen[0] != size) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]); - } - - if (docsSeen.cardinality() != docCount) { - throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality()); - } + status.totalValuePoints += visitor.getPointCountSeen(); } } } + msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS))); } catch (Throwable e) { @@ -1972,6 +1843,167 @@ public final class CheckIndex implements Closeable { return status; } + /** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries. 
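+ * + * <p>A usage sketch, mirroring how CheckIndex drives this visitor above:
+ * <pre>
+ *   VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldName, maxDoc, values);
+ *   values.intersect(visitor);
+ *   // then compare visitor.getPointCountSeen() and visitor.getDocCountSeen()
+ *   // against the values' claimed size() and getDocCount()
+ * </pre>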
+ * + * @lucene.internal */ + public static class VerifyPointsVisitor implements PointValues.IntersectVisitor { + private long pointCountSeen; + private int lastDocID = -1; + private final int maxDoc; + private final FixedBitSet docsSeen; + private final byte[] lastMinPackedValue; + private final byte[] lastMaxPackedValue; + private final byte[] lastPackedValue; + private final byte[] globalMinPackedValue; + private final byte[] globalMaxPackedValue; + private final int packedBytesCount; + private final int numDims; + private final int bytesPerDim; + private final String fieldName; + + /** Sole constructor */ + public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException { + this.maxDoc = maxDoc; + this.fieldName = fieldName; + numDims = values.getNumDimensions(); + bytesPerDim = values.getBytesPerDimension(); + packedBytesCount = numDims * bytesPerDim; + globalMinPackedValue = values.getMinPackedValue(); + globalMaxPackedValue = values.getMaxPackedValue(); + docsSeen = new FixedBitSet(maxDoc); + lastMinPackedValue = new byte[packedBytesCount]; + lastMaxPackedValue = new byte[packedBytesCount]; + lastPackedValue = new byte[packedBytesCount]; + + if (values.getDocCount() > values.size()) { + throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount()); + } + + if (values.getDocCount() > maxDoc) { + throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc); + } + + if (globalMinPackedValue == null) { + if (values.size() != 0) { + throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size()); + } + } else if (globalMinPackedValue.length != packedBytesCount) { + throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount); + } + if (globalMaxPackedValue == null) { + if (values.size() != 0) { + throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size()); + } + } else if (globalMaxPackedValue.length != packedBytesCount) { + throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount); + } + } + + /** Returns total number of points in this BKD tree */ + public long getPointCountSeen() { + return pointCountSeen; + } + + /** Returns total number of unique docIDs in this BKD tree */ + public long getDocCountSeen() { + return docsSeen.cardinality(); + } + + @Override + public void visit(int docID) { + throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID); + } + + @Override + public void visit(int docID, byte[] packedValue) { + checkPackedValue("packed value", packedValue, docID); + pointCountSeen++; + docsSeen.set(docID); + + for(int dim=0;dim 0) { + throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + // Make sure this cell is not outside of the global min/max: + if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) { + throw 
new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) { + throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + + " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + + if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) { + throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) + + " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) { + throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) + + " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\""); + } + } + + // We always pretend the query shape is so complex that it crosses every cell, so + // that packedValue is passed for every document + return PointValues.Relation.CELL_CROSSES_QUERY; + } + + private void checkPackedValue(String desc, byte[] packedValue, int docID) { + if (packedValue == null) { + throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldName + "\""); + } + + if (packedValue.length != packedBytesCount) { + throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldName + "\""); + } + } + } + + /** * Test stored fields. 
* @lucene.experimental diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 6bf7dfc1a86..6cccf4cf1d1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -17,14 +17,15 @@ package org.apache.lucene.util.bkd; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.PointValues; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; @@ -32,14 +33,12 @@ import org.apache.lucene.util.StringHelper; * * @lucene.experimental */ -public class BKDReader extends PointValues implements Accountable { +public final class BKDReader extends PointValues implements Accountable { // Packed array of byte[] holding all split values in the full binary tree: - final private byte[] splitPackedValues; - final long[] leafBlockFPs; - final private int leafNodeOffset; + final int leafNodeOffset; final int numDims; final int bytesPerDim; - final int bytesPerIndexEntry; + final int numLeaves; final IndexInput in; final int maxPointsInLeafNode; final byte[] minPackedValue; @@ -49,6 +48,14 @@ public class BKDReader extends PointValues implements Accountable { final int version; protected final int packedBytesLength; + // Used for 6.4.0+ index format: + final byte[] packedIndex; + + // Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index: + final private byte[] splitPackedValues; + final int bytesPerIndexEntry; + final long[] leafBlockFPs; + /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */ public BKDReader(IndexInput in) throws IOException { version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT); @@ -59,7 +66,7 @@ public class BKDReader extends PointValues implements Accountable { packedBytesLength = numDims * bytesPerDim; // Read index: - int numLeaves = in.readVInt(); + numLeaves = in.readVInt(); assert numLeaves > 0; leafNodeOffset = numLeaves; @@ -78,203 +85,378 @@ public class BKDReader extends PointValues implements Accountable { pointCount = in.readVLong(); docCount = in.readVInt(); - splitPackedValues = new byte[bytesPerIndexEntry*numLeaves]; - - // TODO: don't write split packed values[0]! - in.readBytes(splitPackedValues, 0, splitPackedValues.length); - - // Read the file pointers to the start of each leaf block: - long[] leafBlockFPs = new long[numLeaves]; - long lastFP = 0; - for(int i=0;i 1) { - //System.out.println("BKDR: numLeaves=" + numLeaves); - int levelCount = 2; - while (true) { - //System.out.println(" cycle levelCount=" + levelCount); - if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { - int lastLevel = 2*(numLeaves - levelCount); - assert lastLevel >= 0; - /* - System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount); - System.out.println("FPs before:"); - for(int i=0;i= maxDoc) { - throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." 
+ maxDoc); - } - for(int dim=0;dim 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim)); - } - if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim)); - } - } - - if (numDims == 1) { - // With only 1D, all values should always be in sorted order - if (lastPackedValue == null) { - lastPackedValue = Arrays.copyOf(packedValue, packedValue.length); - } else if (StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) { - throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue)); - } else { - System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim); - } - } - } - - @Override - public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - throw new UnsupportedOperationException(); - } - } - - /** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */ - // TODO: maybe we can get this into CheckIndex? - public void verify(int maxDoc) throws IOException { - //System.out.println("BKDR.verify this=" + this); - // Visits every doc in every leaf block and confirms that - // their values agree with the index: - byte[] rootMinPacked = new byte[packedBytesLength]; - byte[] rootMaxPacked = new byte[packedBytesLength]; - Arrays.fill(rootMaxPacked, (byte) 0xff); - verify(getIntersectState(new VerifyVisitor(numDims, bytesPerDim, maxDoc)), 1, rootMinPacked, rootMaxPacked); - } - - private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException { - - if (nodeID >= leafNodeOffset) { - int leafID = nodeID - leafNodeOffset; - - // In the unbalanced case it's possible the left most node only has one child: - if (leafID < leafBlockFPs.length) { - //System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]); - //System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked)); - - // Leaf node: check that all values are in fact in bounds: - VerifyVisitor visitor = (VerifyVisitor) state.visitor; - visitor.cellMinPacked = cellMinPacked; - visitor.cellMaxPacked = cellMaxPacked; - - int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); - visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); - } else { - //System.out.println("BKDR.verify skip leafID=" + leafID); - } + if (version >= BKDWriter.VERSION_PACKED_INDEX) { + int numBytes = in.readVInt(); + packedIndex = new byte[numBytes]; + in.readBytes(packedIndex, 0, numBytes); + leafBlockFPs = null; + splitPackedValues = null; } else { - // Non-leaf node: + // legacy un-packed index - int address = nodeID * bytesPerIndexEntry; - int splitDim; - if (numDims == 1) { - splitDim = 0; - if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) { - // skip over 
wastefully encoded 0 splitDim: - assert splitPackedValues[address] == 0; - address++; + splitPackedValues = new byte[bytesPerIndexEntry*numLeaves]; + + in.readBytes(splitPackedValues, 0, splitPackedValues.length); + + // Read the file pointers to the start of each leaf block: + long[] leafBlockFPs = new long[numLeaves]; + long lastFP = 0; + for(int i=0;i 1) { + int levelCount = 2; + while (true) { + if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { + int lastLevel = 2*(numLeaves - levelCount); + assert lastLevel >= 0; + if (lastLevel != 0) { + // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading + // at read-time, so that we can still delta code them on disk at write: + long[] newLeafBlockFPs = new long[numLeaves]; + System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel); + System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel); + leafBlockFPs = newLeafBlockFPs; + } + break; + } + + levelCount *= 2; } - } else { - splitDim = splitPackedValues[address++] & 0xff; } - assert splitDim < numDims; - - byte[] splitPackedValue = new byte[packedBytesLength]; - - // Recurse on left sub-tree: - System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength); - System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); - verify(state, - 2*nodeID, - cellMinPacked, splitPackedValue); - - // Recurse on right sub-tree: - System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength); - System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); - verify(state, - 2*nodeID+1, - splitPackedValue, cellMaxPacked); + this.leafBlockFPs = leafBlockFPs; + packedIndex = null; } + + this.in = in; + } + + long getMinLeafBlockFP() { + if (packedIndex != null) { + return new ByteArrayDataInput(packedIndex).readVLong(); + } else { + long minFP = Long.MAX_VALUE; + for(long fp : leafBlockFPs) { + minFP = Math.min(minFP, fp); + } + return minFP; + } + } + + /** Used to walk the in-heap index + * + * @lucene.internal */ + public abstract class IndexTree implements Cloneable { + protected int nodeID; + // level is 1-based so that we can do level-1 w/o checking each time: + protected int level; + protected int splitDim; + protected final byte[][] splitPackedValueStack; + + protected IndexTree() { + int treeDepth = getTreeDepth(); + splitPackedValueStack = new byte[treeDepth+1][]; + nodeID = 1; + level = 1; + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + + public void pushLeft() { + nodeID *= 2; + level++; + if (splitPackedValueStack[level] == null) { + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + } + + /** Clone, but you are not allowed to pop up past the point where the clone happened. 
*/ + public abstract IndexTree clone(); + + public void pushRight() { + nodeID = nodeID * 2 + 1; + level++; + if (splitPackedValueStack[level] == null) { + splitPackedValueStack[level] = new byte[packedBytesLength]; + } + } + + public void pop() { + nodeID /= 2; + level--; + splitDim = -1; + //System.out.println(" pop nodeID=" + nodeID); + } + + public boolean isLeafNode() { + return nodeID >= leafNodeOffset; + } + + public boolean nodeExists() { + return nodeID - leafNodeOffset < leafNodeOffset; + } + + public int getNodeID() { + return nodeID; + } + + public byte[] getSplitPackedValue() { + assert isLeafNode() == false; + assert splitPackedValueStack[level] != null: "level=" + level; + return splitPackedValueStack[level]; + } + + /** Only valid after pushLeft or pushRight, not pop! */ + public int getSplitDim() { + assert isLeafNode() == false; + return splitDim; + } + + /** Only valid after pushLeft or pushRight, not pop! */ + public abstract BytesRef getSplitDimValue(); + + /** Only valid after pushLeft or pushRight, not pop! */ + public abstract long getLeafBlockFP(); + } + + /** Reads the original simple yet heap-heavy index format */ + private final class LegacyIndexTree extends IndexTree { + + private long leafBlockFP; + private final byte[] splitDimValue = new byte[bytesPerDim]; + private final BytesRef scratch = new BytesRef(); + + public LegacyIndexTree() { + setNodeData(); + scratch.bytes = splitDimValue; + scratch.length = bytesPerDim; + } + + @Override + public LegacyIndexTree clone() { + LegacyIndexTree index = new LegacyIndexTree(); + index.nodeID = nodeID; + index.level = level; + index.splitDim = splitDim; + index.leafBlockFP = leafBlockFP; + index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone(); + + return index; + } + + @Override + public void pushLeft() { + super.pushLeft(); + setNodeData(); + } + + @Override + public void pushRight() { + super.pushRight(); + setNodeData(); + } + + private void setNodeData() { + if (isLeafNode()) { + leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset]; + splitDim = -1; + } else { + leafBlockFP = -1; + int address = nodeID * bytesPerIndexEntry; + if (numDims == 1) { + splitDim = 0; + if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) { + // skip over wastefully encoded 0 splitDim: + assert splitPackedValues[address] == 0; + address++; + } + } else { + splitDim = splitPackedValues[address++] & 0xff; + } + System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim); + } + } + + @Override + public long getLeafBlockFP() { + assert isLeafNode(); + return leafBlockFP; + } + + @Override + public BytesRef getSplitDimValue() { + assert isLeafNode() == false; + return scratch; + } + + @Override + public void pop() { + super.pop(); + leafBlockFP = -1; + } + } + + /** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This + * format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing + * downwards one child at a time. 
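+ * + * <p>A sketch of that access pattern in terms of the {@code IndexTree} API above
+ * (recursion elided; the tree instance comes from {@code getIntersectState(visitor).index}):
+ * <pre>
+ *   if (index.isLeafNode() == false) {
+ *     index.pushLeft();    // per-dim deltas on this split dim are negative below here
+ *     // ... recurse into the left sub-tree, then:
+ *     index.pop();
+ *     index.pushRight();
+ *     // ... recurse into the right sub-tree, then:
+ *     index.pop();
+ *   }
+ * </pre>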
*/ + private final class PackedIndexTree extends IndexTree { + // used to read the packed byte[] + private final ByteArrayDataInput in; + // holds the minimum (left most) leaf block file pointer for each level we've recursed to: + private final long[] leafBlockFPStack; + // holds the address, in the packed byte[] index, of the left-node of each level: + private final int[] leftNodePositions; + // holds the address, in the packed byte[] index, of the right-node of each level: + private final int[] rightNodePositions; + // holds the splitDim for each level: + private final int[] splitDims; + // true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed + // 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we + // split on this dimension, we next pushed to the left sub-tree: + private final boolean[] negativeDeltas; + // holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses: + private final byte[][] splitValuesStack; + // scratch value to return from getPackedValue: + private final BytesRef scratch; + + public PackedIndexTree() { + int treeDepth = getTreeDepth(); + leafBlockFPStack = new long[treeDepth+1]; + leftNodePositions = new int[treeDepth+1]; + rightNodePositions = new int[treeDepth+1]; + splitValuesStack = new byte[treeDepth+1][]; + splitDims = new int[treeDepth+1]; + negativeDeltas = new boolean[numDims*(treeDepth+1)]; + + in = new ByteArrayDataInput(packedIndex); + splitValuesStack[0] = new byte[packedBytesLength]; + readNodeData(false); + scratch = new BytesRef(); + scratch.length = bytesPerDim; + } + + @Override + public PackedIndexTree clone() { + PackedIndexTree index = new PackedIndexTree(); + index.nodeID = nodeID; + index.level = level; + index.splitDim = splitDim; + System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims); + index.leafBlockFPStack[level] = leafBlockFPStack[level]; + index.leftNodePositions[level] = leftNodePositions[level]; + index.rightNodePositions[level] = rightNodePositions[level]; + index.splitValuesStack[index.level] = splitValuesStack[index.level].clone(); + System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims); + index.splitDims[level] = splitDims[level]; + return index; + } + + @Override + public void pushLeft() { + int nodePosition = leftNodePositions[level]; + super.pushLeft(); + System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims); + assert splitDim != -1; + negativeDeltas[level*numDims+splitDim] = true; + in.setPosition(nodePosition); + readNodeData(true); + } + + @Override + public void pushRight() { + int nodePosition = rightNodePositions[level]; + super.pushRight(); + System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims); + assert splitDim != -1; + negativeDeltas[level*numDims+splitDim] = false; + in.setPosition(nodePosition); + readNodeData(false); + } + + @Override + public void pop() { + super.pop(); + splitDim = splitDims[level]; + } + + @Override + public long getLeafBlockFP() { + assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf"; + return leafBlockFPStack[level]; + } + + @Override + public BytesRef getSplitDimValue() { + assert isLeafNode() == false; + scratch.bytes = splitValuesStack[level]; + scratch.offset = splitDim * bytesPerDim; + return scratch; + } + + private void 
readNodeData(boolean isLeft) { + + leafBlockFPStack[level] = leafBlockFPStack[level-1]; + + // read leaf block FP delta + if (isLeft == false) { + leafBlockFPStack[level] += in.readVLong(); + } + + if (isLeafNode()) { + splitDim = -1; + } else { + + // read split dim, prefix, firstDiffByteDelta encoded as int: + int code = in.readVInt(); + splitDim = code % numDims; + splitDims[level] = splitDim; + code /= numDims; + int prefix = code % (1+bytesPerDim); + int suffix = bytesPerDim - prefix; + + if (splitValuesStack[level] == null) { + splitValuesStack[level] = new byte[packedBytesLength]; + } + System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength); + if (suffix > 0) { + int firstDiffByteDelta = code / (1+bytesPerDim); + if (negativeDeltas[level*numDims + splitDim]) { + firstDiffByteDelta = -firstDiffByteDelta; + } + int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF; + splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta); + in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1); + } else { + // our split value is == last split value in this dim, which can happen when there are many duplicate values + } + + int leftNumBytes; + if (nodeID * 2 < leafNodeOffset) { + leftNumBytes = in.readVInt(); + } else { + leftNumBytes = 0; + } + + leftNodePositions[level] = in.getPosition(); + rightNodePositions[level] = leftNodePositions[level] + leftNumBytes; + } + } + } + + private int getTreeDepth() { + // First +1 because all the non-leave nodes makes another power + // of 2; e.g. to have a fully balanced tree with 4 leaves you + // need a depth=3 tree: + + // Second +1 because MathUtil.log computes floor of the logarithm; e.g. + // with 5 leaves you need a depth=4 tree: + return MathUtil.log(numLeaves, 2) + 2; } /** Used to track all state for a single call to {@link #intersect}. */ @@ -285,57 +467,73 @@ public class BKDReader extends PointValues implements Accountable { final int[] commonPrefixLengths; final IntersectVisitor visitor; + public final IndexTree index; public IntersectState(IndexInput in, int numDims, int packedBytesLength, int maxPointsInLeafNode, - IntersectVisitor visitor) { + IntersectVisitor visitor, + IndexTree indexVisitor) { this.in = in; this.visitor = visitor; this.commonPrefixLengths = new int[numDims]; this.scratchDocIDs = new int[maxPointsInLeafNode]; this.scratchPackedValue = new byte[packedBytesLength]; + this.index = indexVisitor; } } public void intersect(IntersectVisitor visitor) throws IOException { - intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); + intersect(getIntersectState(visitor), minPackedValue, maxPackedValue); } /** Fast path: this is called when the query box fully encompasses all cells under this node. */ - private void addAll(IntersectState state, int nodeID) throws IOException { + private void addAll(IntersectState state) throws IOException { //System.out.println("R: addAll nodeID=" + nodeID); - if (nodeID >= leafNodeOffset) { + if (state.index.isLeafNode()) { //System.out.println("ADDALL"); - visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor); + if (state.index.nodeExists()) { + visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor); + } // TODO: we can assert that the first value here in fact matches what the index claimed? 
} else { - addAll(state, 2*nodeID); - addAll(state, 2*nodeID+1); + state.index.pushLeft(); + addAll(state); + state.index.pop(); + + state.index.pushRight(); + addAll(state); + state.index.pop(); } } /** Create a new {@link IntersectState} */ public IntersectState getIntersectState(IntersectVisitor visitor) { + IndexTree index; + if (packedIndex != null) { + index = new PackedIndexTree(); + } else { + index = new LegacyIndexTree(); + } return new IntersectState(in.clone(), numDims, packedBytesLength, maxPointsInLeafNode, - visitor); + visitor, + index); } /** Visits all docIDs and packed values in a single leaf block */ - public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException { - int leafID = nodeID - leafNodeOffset; + public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException { // Leaf node; scan and filter all points in this block: - int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); + int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs); // Again, this time reading values and checking with the visitor visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); } - protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { + private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { // Leaf node in.seek(blockFP); @@ -350,7 +548,7 @@ public class BKDReader extends PointValues implements Accountable { } } - protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { + int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { in.seek(blockFP); // How many points are stored in this leaf cell: @@ -365,7 +563,7 @@ public class BKDReader extends PointValues implements Accountable { return count; } - protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { visitor.grow(count); readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in); @@ -434,13 +632,10 @@ public class BKDReader extends PointValues implements Accountable { } } - private void intersect(IntersectState state, - int nodeID, - byte[] cellMinPacked, byte[] cellMaxPacked) - throws IOException { + private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException { /* - System.out.println("\nR: intersect nodeID=" + nodeID); + System.out.println("\nR: intersect nodeID=" + state.index.getNodeID()); for(int dim=0;dim= 0 && dim < numDims; @@ -1019,46 +1034,238 @@ public class BKDWriter implements Closeable { return indexFP; } - /** Subclass can change how it writes the index. */ - protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { + /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */ + private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { + + int numLeaves = leafBlockFPs.length; + + // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens + // if it was created by OneDimensionBKDWriter). 
In this case the leaf nodes may straddle the two bottom + // levels of the binary tree: + if (numDims == 1 && numLeaves > 1) { + int levelCount = 2; + while (true) { + if (numLeaves >= levelCount && numLeaves <= 2*levelCount) { + int lastLevel = 2*(numLeaves - levelCount); + assert lastLevel >= 0; + if (lastLevel != 0) { + // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading + // at read-time, so that we can still delta code them on disk at write: + long[] newLeafBlockFPs = new long[numLeaves]; + System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel); + System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel); + leafBlockFPs = newLeafBlockFPs; + } + break; + } + + levelCount *= 2; + } + } + + /** Reused while packing the index */ + RAMOutputStream writeBuffer = new RAMOutputStream(); + + // This is the "file" we append the byte[] to: + List blocks = new ArrayList<>(); + byte[] lastSplitValues = new byte[bytesPerDim * numDims]; + //System.out.println("\npack index"); + int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numDims], false); + + // Compact the byte[] blocks into single byte index: + byte[] index = new byte[totalSize]; + int upto = 0; + for(byte[] block : blocks) { + System.arraycopy(block, 0, index, upto, block.length); + upto += block.length; + } + assert upto == totalSize; + + return index; + } + + /** Appends the current contents of writeBuffer as another block on the growing in-memory file */ + private int appendBlock(RAMOutputStream writeBuffer, List blocks) throws IOException { + int pos = Math.toIntExact(writeBuffer.getFilePointer()); + byte[] bytes = new byte[pos]; + writeBuffer.writeTo(bytes, 0); + writeBuffer.reset(); + blocks.add(bytes); + return pos; + } + + /** + * lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node + */ + private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List blocks, + int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException { + if (nodeID >= leafBlockFPs.length) { + int leafID = nodeID - leafBlockFPs.length; + //System.out.println("recursePack leaf nodeID=" + nodeID); + + // In the unbalanced case it's possible the left most node only has one child: + if (leafID < leafBlockFPs.length) { + long delta = leafBlockFPs[leafID] - minBlockFP; + if (isLeft) { + assert delta == 0; + return 0; + } else { + assert nodeID == 1 || delta > 0: "nodeID=" + nodeID; + writeBuffer.writeVLong(delta); + return appendBlock(writeBuffer, blocks); + } + } else { + return 0; + } + } else { + long leftBlockFP; + if (isLeft == false) { + leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID); + long delta = leftBlockFP - minBlockFP; + assert nodeID == 1 || delta > 0; + writeBuffer.writeVLong(delta); + } else { + // The left tree's left most leaf block FP is always the minimal FP: + leftBlockFP = minBlockFP; + } + + int address = nodeID * (1+bytesPerDim); + int splitDim = splitPackedValues[address++] & 0xff; + + //System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim)); + + // find common prefix with last split value in this dim: + int prefix = 0; + for(;prefix 0; - 
out.writeVInt(leafBlockFPs.length); + assert numLeaves > 0; + out.writeVInt(numLeaves); out.writeBytes(minPackedValue, 0, packedBytesLength); out.writeBytes(maxPackedValue, 0, packedBytesLength); out.writeVLong(pointCount); out.writeVInt(docsSeen.cardinality()); - - // NOTE: splitPackedValues[0] is unused, because nodeID is 1-based: - if (numDims == 1) { - // write the index, skipping the byte used to store the split dim since it is always 0 - for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) { - out.writeBytes(splitPackedValues, i, bytesPerDim); - } - } else { - out.writeBytes(splitPackedValues, 0, splitPackedValues.length); - } - - long lastFP = 0; - for (int i=0;i 0: "maxPointsInLeafNode=" + maxPointsInLeafNode; out.writeVInt(count); DocIdsWriter.writeDocIds(docIDs, start, count, out); } - protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { + private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { int prefixLenSum = Arrays.stream(commonPrefixLengths).sum(); if (prefixLenSum == packedBytesLength) { // all values in this block are equal @@ -1109,7 +1316,7 @@ public class BKDWriter implements Closeable { return end - start; } - protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { + private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { for(int dim=0;dim(terms[idx], - outputs.newPair((long) idx, value))); + outputs.newPair((long) idx, value))); } new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java index 3b9f302f5eb..587c63fb7a3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java +++ b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java @@ -26,7 +26,10 @@ import org.apache.lucene.geo.Rectangle; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.SloppyMath; +import org.apache.lucene.util.bkd.BKDReader.IndexTree; +import org.apache.lucene.util.bkd.BKDReader.IntersectState; import org.apache.lucene.util.bkd.BKDReader; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude; @@ -41,16 +44,16 @@ class NearestNeighbor { static class Cell implements Comparable { final int readerIndex; - final int nodeID; final byte[] minPacked; final byte[] maxPacked; + final IndexTree index; /** The closest possible distance of all points in this cell */ final double distanceMeters; - public Cell(int readerIndex, int nodeID, byte[] minPacked, byte[] maxPacked, double distanceMeters) { + public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) { + this.index = index; this.readerIndex = readerIndex; - this.nodeID = nodeID; this.minPacked = minPacked.clone(); this.maxPacked = maxPacked.clone(); this.distanceMeters = distanceMeters; @@ -66,7 +69,7 @@ class NearestNeighbor { double minLon = decodeLongitude(minPacked, Integer.BYTES); double maxLat = decodeLatitude(maxPacked, 0); double maxLon = 
decodeLongitude(maxPacked, Integer.BYTES); - return "Cell(readerIndex=" + readerIndex + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")"; + return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")"; } } @@ -219,13 +222,21 @@ class NearestNeighbor { List states = new ArrayList<>(); // Add root cell for each reader into the queue: + int bytesPerDim = -1; + for(int i=0;i