mirror of
https://github.com/apache/lucene.git
synced 2025-02-09 03:25:15 +00:00
LUCENE-7563: use a compressed format for the in-heap BKD index
This commit is contained in:
parent
39c2f3d80f
commit
5e8db2e068
@ -126,6 +126,10 @@ Optimizations
|
|||||||
* LUCENE-7568: Optimize merging when index sorting is used but the
|
* LUCENE-7568: Optimize merging when index sorting is used but the
|
||||||
index is already sorted (Jim Ferenczi via Mike McCandless)
|
index is already sorted (Jim Ferenczi via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-7563: The BKD in-memory index for dimensional points now uses
|
||||||
|
a compressed format, using substantially less RAM in some cases
|
||||||
|
(Adrien Grand, Mike McCandless)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
|
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
|
||||||
|
@ -16,13 +16,17 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.simpletext;
|
package org.apache.lucene.codecs.simpletext;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.simpletext.SimpleTextUtil;
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
import org.apache.lucene.index.PointValues;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.bkd.BKDReader;
|
import org.apache.lucene.util.bkd.BKDReader;
|
||||||
|
|
||||||
@ -30,15 +34,105 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_C
|
|||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
|
||||||
|
|
||||||
class SimpleTextBKDReader extends BKDReader {
|
/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
|
||||||
|
|
||||||
public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
|
final class SimpleTextBKDReader extends PointValues implements Accountable {
|
||||||
|
// Packed array of byte[] holding all split values in the full binary tree:
|
||||||
|
final private byte[] splitPackedValues;
|
||||||
|
final long[] leafBlockFPs;
|
||||||
|
final private int leafNodeOffset;
|
||||||
|
final int numDims;
|
||||||
|
final int bytesPerDim;
|
||||||
|
final int bytesPerIndexEntry;
|
||||||
|
final IndexInput in;
|
||||||
|
final int maxPointsInLeafNode;
|
||||||
|
final byte[] minPackedValue;
|
||||||
|
final byte[] maxPackedValue;
|
||||||
|
final long pointCount;
|
||||||
|
final int docCount;
|
||||||
|
final int version;
|
||||||
|
protected final int packedBytesLength;
|
||||||
|
|
||||||
|
public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
|
||||||
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
|
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
|
||||||
super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount);
|
this.in = in;
|
||||||
|
this.numDims = numDims;
|
||||||
|
this.maxPointsInLeafNode = maxPointsInLeafNode;
|
||||||
|
this.bytesPerDim = bytesPerDim;
|
||||||
|
// no version check here because callers of this API (SimpleText) have no back compat:
|
||||||
|
bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
|
||||||
|
packedBytesLength = numDims * bytesPerDim;
|
||||||
|
this.leafNodeOffset = leafBlockFPs.length;
|
||||||
|
this.leafBlockFPs = leafBlockFPs;
|
||||||
|
this.splitPackedValues = splitPackedValues;
|
||||||
|
this.minPackedValue = minPackedValue;
|
||||||
|
this.maxPackedValue = maxPackedValue;
|
||||||
|
this.pointCount = pointCount;
|
||||||
|
this.docCount = docCount;
|
||||||
|
this.version = SimpleTextBKDWriter.VERSION_CURRENT;
|
||||||
|
assert minPackedValue.length == packedBytesLength;
|
||||||
|
assert maxPackedValue.length == packedBytesLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/** Used to track all state for a single call to {@link #intersect}. */
|
||||||
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
|
public static final class IntersectState {
|
||||||
|
final IndexInput in;
|
||||||
|
final int[] scratchDocIDs;
|
||||||
|
final byte[] scratchPackedValue;
|
||||||
|
final int[] commonPrefixLengths;
|
||||||
|
|
||||||
|
final IntersectVisitor visitor;
|
||||||
|
|
||||||
|
public IntersectState(IndexInput in, int numDims,
|
||||||
|
int packedBytesLength,
|
||||||
|
int maxPointsInLeafNode,
|
||||||
|
IntersectVisitor visitor) {
|
||||||
|
this.in = in;
|
||||||
|
this.visitor = visitor;
|
||||||
|
this.commonPrefixLengths = new int[numDims];
|
||||||
|
this.scratchDocIDs = new int[maxPointsInLeafNode];
|
||||||
|
this.scratchPackedValue = new byte[packedBytesLength];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void intersect(IntersectVisitor visitor) throws IOException {
|
||||||
|
intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
|
||||||
|
private void addAll(IntersectState state, int nodeID) throws IOException {
|
||||||
|
//System.out.println("R: addAll nodeID=" + nodeID);
|
||||||
|
|
||||||
|
if (nodeID >= leafNodeOffset) {
|
||||||
|
//System.out.println("ADDALL");
|
||||||
|
visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
|
||||||
|
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
||||||
|
} else {
|
||||||
|
addAll(state, 2*nodeID);
|
||||||
|
addAll(state, 2*nodeID+1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Create a new {@link IntersectState} */
|
||||||
|
public IntersectState getIntersectState(IntersectVisitor visitor) {
|
||||||
|
return new IntersectState(in.clone(), numDims,
|
||||||
|
packedBytesLength,
|
||||||
|
maxPointsInLeafNode,
|
||||||
|
visitor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visits all docIDs and packed values in a single leaf block */
|
||||||
|
public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
|
||||||
|
int leafID = nodeID - leafNodeOffset;
|
||||||
|
|
||||||
|
// Leaf node; scan and filter all points in this block:
|
||||||
|
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
|
||||||
|
|
||||||
|
// Again, this time reading values and checking with the visitor
|
||||||
|
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
||||||
|
}
|
||||||
|
|
||||||
|
void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
|
||||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
in.seek(blockFP);
|
in.seek(blockFP);
|
||||||
readLine(in, scratch);
|
readLine(in, scratch);
|
||||||
@ -50,8 +144,7 @@ class SimpleTextBKDReader extends BKDReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
|
||||||
protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
|
|
||||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
in.seek(blockFP);
|
in.seek(blockFP);
|
||||||
readLine(in, scratch);
|
readLine(in, scratch);
|
||||||
@ -63,8 +156,7 @@ class SimpleTextBKDReader extends BKDReader {
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||||
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
|
||||||
visitor.grow(count);
|
visitor.grow(count);
|
||||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
||||||
assert scratchPackedValue.length == packedBytesLength;
|
assert scratchPackedValue.length == packedBytesLength;
|
||||||
@ -79,6 +171,175 @@ class SimpleTextBKDReader extends BKDReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
|
||||||
|
// the byte at `compressedByteOffset` is compressed using run-length compression,
|
||||||
|
// other suffix bytes are stored verbatim
|
||||||
|
final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
|
||||||
|
commonPrefixLengths[compressedDim]++;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < count; ) {
|
||||||
|
scratchPackedValue[compressedByteOffset] = in.readByte();
|
||||||
|
final int runLen = Byte.toUnsignedInt(in.readByte());
|
||||||
|
for (int j = 0; j < runLen; ++j) {
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int prefix = commonPrefixLengths[dim];
|
||||||
|
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
||||||
|
}
|
||||||
|
visitor.visit(docIDs[i+j], scratchPackedValue);
|
||||||
|
}
|
||||||
|
i += runLen;
|
||||||
|
}
|
||||||
|
if (i != count) {
|
||||||
|
throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readCompressedDim(IndexInput in) throws IOException {
|
||||||
|
int compressedDim = in.readByte();
|
||||||
|
if (compressedDim < -1 || compressedDim >= numDims) {
|
||||||
|
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
|
||||||
|
}
|
||||||
|
return compressedDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int prefix = in.readVInt();
|
||||||
|
commonPrefixLengths[dim] = prefix;
|
||||||
|
if (prefix > 0) {
|
||||||
|
in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
|
||||||
|
}
|
||||||
|
//System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void intersect(IntersectState state,
|
||||||
|
int nodeID,
|
||||||
|
byte[] cellMinPacked, byte[] cellMaxPacked)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.println("\nR: intersect nodeID=" + nodeID);
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
|
||||||
|
|
||||||
|
if (r == Relation.CELL_OUTSIDE_QUERY) {
|
||||||
|
// This cell is fully outside of the query shape: stop recursing
|
||||||
|
return;
|
||||||
|
} else if (r == Relation.CELL_INSIDE_QUERY) {
|
||||||
|
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
|
||||||
|
addAll(state, nodeID);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nodeID >= leafNodeOffset) {
|
||||||
|
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
||||||
|
|
||||||
|
int leafID = nodeID - leafNodeOffset;
|
||||||
|
|
||||||
|
// In the unbalanced case it's possible the left most node only has one child:
|
||||||
|
if (leafID < leafBlockFPs.length) {
|
||||||
|
// Leaf node; scan and filter all points in this block:
|
||||||
|
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
|
||||||
|
|
||||||
|
// Again, this time reading values and checking with the visitor
|
||||||
|
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// Non-leaf node: recurse on the split left and right nodes
|
||||||
|
|
||||||
|
int address = nodeID * bytesPerIndexEntry;
|
||||||
|
int splitDim;
|
||||||
|
if (numDims == 1) {
|
||||||
|
splitDim = 0;
|
||||||
|
} else {
|
||||||
|
splitDim = splitPackedValues[address++] & 0xff;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert splitDim < numDims;
|
||||||
|
|
||||||
|
// TODO: can we alloc & reuse this up front?
|
||||||
|
|
||||||
|
byte[] splitPackedValue = new byte[packedBytesLength];
|
||||||
|
|
||||||
|
// Recurse on left sub-tree:
|
||||||
|
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
|
||||||
|
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
|
intersect(state,
|
||||||
|
2*nodeID,
|
||||||
|
cellMinPacked, splitPackedValue);
|
||||||
|
|
||||||
|
// Recurse on right sub-tree:
|
||||||
|
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
|
||||||
|
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
|
intersect(state,
|
||||||
|
2*nodeID+1,
|
||||||
|
splitPackedValue, cellMaxPacked);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Copies the split value for this node into the provided byte array */
|
||||||
|
public void copySplitValue(int nodeID, byte[] splitPackedValue) {
|
||||||
|
int address = nodeID * bytesPerIndexEntry;
|
||||||
|
int splitDim;
|
||||||
|
if (numDims == 1) {
|
||||||
|
splitDim = 0;
|
||||||
|
} else {
|
||||||
|
splitDim = splitPackedValues[address++] & 0xff;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert splitDim < numDims;
|
||||||
|
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return RamUsageEstimator.sizeOf(splitPackedValues) +
|
||||||
|
RamUsageEstimator.sizeOf(leafBlockFPs);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] getMinPackedValue() {
|
||||||
|
return minPackedValue.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] getMaxPackedValue() {
|
||||||
|
return maxPackedValue.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getNumDimensions() {
|
||||||
|
return numDims;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getBytesPerDimension() {
|
||||||
|
return bytesPerDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long size() {
|
||||||
|
return pointCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getDocCount() {
|
||||||
|
return docCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isLeafNode(int nodeID) {
|
||||||
|
return nodeID >= leafNodeOffset;
|
||||||
|
}
|
||||||
|
|
||||||
private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
|
private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
|
||||||
assert startsWith(scratch, prefix);
|
assert startsWith(scratch, prefix);
|
||||||
return Integer.parseInt(stripPrefix(scratch, prefix));
|
return Integer.parseInt(stripPrefix(scratch, prefix));
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -36,7 +36,6 @@ import org.apache.lucene.util.BytesRef;
|
|||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.bkd.BKDReader;
|
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
|
||||||
@ -58,7 +57,7 @@ class SimpleTextPointsReader extends PointsReader {
|
|||||||
|
|
||||||
private final IndexInput dataIn;
|
private final IndexInput dataIn;
|
||||||
final SegmentReadState readState;
|
final SegmentReadState readState;
|
||||||
final Map<String,BKDReader> readers = new HashMap<>();
|
final Map<String,SimpleTextBKDReader> readers = new HashMap<>();
|
||||||
final BytesRefBuilder scratch = new BytesRefBuilder();
|
final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
|
|
||||||
public SimpleTextPointsReader(SegmentReadState readState) throws IOException {
|
public SimpleTextPointsReader(SegmentReadState readState) throws IOException {
|
||||||
@ -98,7 +97,7 @@ class SimpleTextPointsReader extends PointsReader {
|
|||||||
this.readState = readState;
|
this.readState = readState;
|
||||||
}
|
}
|
||||||
|
|
||||||
private BKDReader initReader(long fp) throws IOException {
|
private SimpleTextBKDReader initReader(long fp) throws IOException {
|
||||||
// NOTE: matches what writeIndex does in SimpleTextPointsWriter
|
// NOTE: matches what writeIndex does in SimpleTextPointsWriter
|
||||||
dataIn.seek(fp);
|
dataIn.seek(fp);
|
||||||
readLine(dataIn);
|
readLine(dataIn);
|
||||||
|
@ -20,7 +20,6 @@ package org.apache.lucene.codecs.simpletext;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.IntFunction;
|
|
||||||
|
|
||||||
import org.apache.lucene.codecs.PointsReader;
|
import org.apache.lucene.codecs.PointsReader;
|
||||||
import org.apache.lucene.codecs.PointsWriter;
|
import org.apache.lucene.codecs.PointsWriter;
|
||||||
@ -33,29 +32,28 @@ import org.apache.lucene.index.SegmentWriteState;
|
|||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.bkd.BKDWriter;
|
|
||||||
|
|
||||||
class SimpleTextPointsWriter extends PointsWriter {
|
class SimpleTextPointsWriter extends PointsWriter {
|
||||||
|
|
||||||
final static BytesRef NUM_DIMS = new BytesRef("num dims ");
|
public final static BytesRef NUM_DIMS = new BytesRef("num dims ");
|
||||||
final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
|
public final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
|
||||||
final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
|
public final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
|
||||||
final static BytesRef INDEX_COUNT = new BytesRef("index count ");
|
public final static BytesRef INDEX_COUNT = new BytesRef("index count ");
|
||||||
final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
|
public final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
|
||||||
final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
|
public final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
|
||||||
final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
|
public final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
|
||||||
final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
|
public final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
|
||||||
final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
|
public final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
|
||||||
final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
|
public final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
|
||||||
final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
|
public final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
|
||||||
final static BytesRef FIELD_COUNT = new BytesRef("field count ");
|
public final static BytesRef FIELD_COUNT = new BytesRef("field count ");
|
||||||
final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
|
public final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
|
||||||
final static BytesRef FIELD_FP = new BytesRef(" field fp ");
|
public final static BytesRef FIELD_FP = new BytesRef(" field fp ");
|
||||||
final static BytesRef MIN_VALUE = new BytesRef("min value ");
|
public final static BytesRef MIN_VALUE = new BytesRef("min value ");
|
||||||
final static BytesRef MAX_VALUE = new BytesRef("max value ");
|
public final static BytesRef MAX_VALUE = new BytesRef("max value ");
|
||||||
final static BytesRef POINT_COUNT = new BytesRef("point count ");
|
public final static BytesRef POINT_COUNT = new BytesRef("point count ");
|
||||||
final static BytesRef DOC_COUNT = new BytesRef("doc count ");
|
public final static BytesRef DOC_COUNT = new BytesRef("doc count ");
|
||||||
final static BytesRef END = new BytesRef("END");
|
public final static BytesRef END = new BytesRef("END");
|
||||||
|
|
||||||
private IndexOutput dataOut;
|
private IndexOutput dataOut;
|
||||||
final BytesRefBuilder scratch = new BytesRefBuilder();
|
final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
@ -75,105 +73,15 @@ class SimpleTextPointsWriter extends PointsWriter {
|
|||||||
boolean singleValuePerDoc = values.size() == values.getDocCount();
|
boolean singleValuePerDoc = values.size() == values.getDocCount();
|
||||||
|
|
||||||
// We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
|
// We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
|
||||||
try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
|
try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(),
|
||||||
writeState.directory,
|
writeState.directory,
|
||||||
writeState.segmentInfo.name,
|
writeState.segmentInfo.name,
|
||||||
fieldInfo.getPointDimensionCount(),
|
fieldInfo.getPointDimensionCount(),
|
||||||
fieldInfo.getPointNumBytes(),
|
fieldInfo.getPointNumBytes(),
|
||||||
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
|
SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
|
||||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
|
SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
|
||||||
values.size(),
|
values.size(),
|
||||||
singleValuePerDoc) {
|
singleValuePerDoc)) {
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
|
|
||||||
write(out, NUM_DIMS);
|
|
||||||
writeInt(out, numDims);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, BYTES_PER_DIM);
|
|
||||||
writeInt(out, bytesPerDim);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, MAX_LEAF_POINTS);
|
|
||||||
writeInt(out, maxPointsInLeafNode);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, INDEX_COUNT);
|
|
||||||
writeInt(out, leafBlockFPs.length);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, MIN_VALUE);
|
|
||||||
BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length);
|
|
||||||
write(out, br.toString());
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, MAX_VALUE);
|
|
||||||
br = new BytesRef(maxPackedValue, 0, maxPackedValue.length);
|
|
||||||
write(out, br.toString());
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, POINT_COUNT);
|
|
||||||
writeLong(out, pointCount);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
write(out, DOC_COUNT);
|
|
||||||
writeInt(out, docsSeen.cardinality());
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
for(int i=0;i<leafBlockFPs.length;i++) {
|
|
||||||
write(out, BLOCK_FP);
|
|
||||||
writeLong(out, leafBlockFPs[i]);
|
|
||||||
newline(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert (splitPackedValues.length % (1 + fieldInfo.getPointNumBytes())) == 0;
|
|
||||||
int count = splitPackedValues.length / (1 + fieldInfo.getPointNumBytes());
|
|
||||||
assert count == leafBlockFPs.length;
|
|
||||||
|
|
||||||
write(out, SPLIT_COUNT);
|
|
||||||
writeInt(out, count);
|
|
||||||
newline(out);
|
|
||||||
|
|
||||||
for(int i=0;i<count;i++) {
|
|
||||||
write(out, SPLIT_DIM);
|
|
||||||
writeInt(out, splitPackedValues[i * (1 + fieldInfo.getPointNumBytes())] & 0xff);
|
|
||||||
newline(out);
|
|
||||||
write(out, SPLIT_VALUE);
|
|
||||||
br = new BytesRef(splitPackedValues, 1+(i * (1+fieldInfo.getPointNumBytes())), fieldInfo.getPointNumBytes());
|
|
||||||
write(out, br.toString());
|
|
||||||
newline(out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
|
|
||||||
write(out, BLOCK_COUNT);
|
|
||||||
writeInt(out, count);
|
|
||||||
newline(out);
|
|
||||||
for(int i=0;i<count;i++) {
|
|
||||||
write(out, BLOCK_DOC_ID);
|
|
||||||
writeInt(out, docIDs[start+i]);
|
|
||||||
newline(out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixLengths, byte[] packedValue) {
|
|
||||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
|
||||||
for (int i = 0; i < count; ++i) {
|
|
||||||
BytesRef packedValue = packedValues.apply(i);
|
|
||||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
|
||||||
write(out, BLOCK_VALUE);
|
|
||||||
write(out, packedValue.toString());
|
|
||||||
newline(out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}) {
|
|
||||||
|
|
||||||
values.intersect(new IntersectVisitor() {
|
values.intersect(new IntersectVisitor() {
|
||||||
@Override
|
@Override
|
||||||
@ -198,26 +106,6 @@ class SimpleTextPointsWriter extends PointsWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void write(IndexOutput out, String s) throws IOException {
|
|
||||||
SimpleTextUtil.write(out, s, scratch);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeInt(IndexOutput out, int x) throws IOException {
|
|
||||||
SimpleTextUtil.write(out, Integer.toString(x), scratch);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeLong(IndexOutput out, long x) throws IOException {
|
|
||||||
SimpleTextUtil.write(out, Long.toString(x), scratch);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void write(IndexOutput out, BytesRef b) throws IOException {
|
|
||||||
SimpleTextUtil.write(out, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void newline(IndexOutput out) throws IOException {
|
|
||||||
SimpleTextUtil.writeNewline(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish() throws IOException {
|
public void finish() throws IOException {
|
||||||
SimpleTextUtil.write(dataOut, END);
|
SimpleTextUtil.write(dataOut, END);
|
||||||
@ -250,4 +138,24 @@ class SimpleTextPointsWriter extends PointsWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void write(IndexOutput out, String s) throws IOException {
|
||||||
|
SimpleTextUtil.write(out, s, scratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeInt(IndexOutput out, int x) throws IOException {
|
||||||
|
SimpleTextUtil.write(out, Integer.toString(x), scratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeLong(IndexOutput out, long x) throws IOException {
|
||||||
|
SimpleTextUtil.write(out, Long.toString(x), scratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void write(IndexOutput out, BytesRef b) throws IOException {
|
||||||
|
SimpleTextUtil.write(out, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void newline(IndexOutput out) throws IOException {
|
||||||
|
SimpleTextUtil.writeNewline(out);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -28,7 +28,8 @@ import org.apache.lucene.index.SegmentWriteState;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure
|
* Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure
|
||||||
* for fast shape intersection filtering. See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
|
* for fast 1D range and N dimesional shape intersection filtering.
|
||||||
|
* See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
|
||||||
*
|
*
|
||||||
* <p>This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced
|
* <p>This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced
|
||||||
* binary tree of split values referencing those blocks at the leaves.
|
* binary tree of split values referencing those blocks at the leaves.
|
||||||
@ -50,10 +51,13 @@ import org.apache.lucene.index.SegmentWriteState;
|
|||||||
* <li> maxPointsInLeafNode (vInt)
|
* <li> maxPointsInLeafNode (vInt)
|
||||||
* <li> bytesPerDim (vInt)
|
* <li> bytesPerDim (vInt)
|
||||||
* <li> count (vInt)
|
* <li> count (vInt)
|
||||||
* <li> byte[bytesPerDim]<sup>count</sup> (packed <code>byte[]</code> all split values)
|
* <li> packed index (byte[])
|
||||||
* <li> delta-blockFP (vLong)<sup>count</sup> (delta-coded file pointers to the on-disk leaf blocks))
|
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
|
* <p>The packed index uses hierarchical delta and prefix coding to compactly encode the file pointer for
|
||||||
|
* all leaf blocks, once the tree is traversed, as well as the split dimension and split value for each
|
||||||
|
* inner node of the tree.
|
||||||
|
*
|
||||||
* <p>After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum.
|
* <p>After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum.
|
||||||
*
|
*
|
||||||
* <p>The <code>.dii</code> file records the file pointer in the <code>.dim</code> file where each field's
|
* <p>The <code>.dii</code> file records the file pointer in the <code>.dim</code> file where each field's
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62}
|
* Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
|
||||||
* for an overview of the index format.
|
* for an overview of the current index format.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene60;
|
package org.apache.lucene.codecs.lucene60;
|
||||||
|
@ -17,8 +17,8 @@
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Components from the Lucene 6.2 index format
|
* Components from the Lucene 6.2 index format
|
||||||
* See {@link org.apache.lucene.codecs.lucene62} for an overview
|
* See {@link org.apache.lucene.codecs.lucene70} for an overview
|
||||||
* of the index format.
|
* of the current index format.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.codecs.lucene62;
|
package org.apache.lucene.codecs.lucene62;
|
||||||
|
@ -185,6 +185,12 @@
|
|||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
|
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
|
||||||
* An optional file indicating which documents are live.
|
* An optional file indicating which documents are live.
|
||||||
* </li>
|
* </li>
|
||||||
|
* <li>
|
||||||
|
* {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}.
|
||||||
|
* Optional pair of files, recording dimensionally indexed fields, to enable fast
|
||||||
|
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
|
||||||
|
* and geographic shape intersection (2D, 3D).
|
||||||
|
* </li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>Details on each of these are provided in their linked pages.</p>
|
* <p>Details on each of these are provided in their linked pages.</p>
|
||||||
* </div>
|
* </div>
|
||||||
@ -300,7 +306,12 @@
|
|||||||
* <tr>
|
* <tr>
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
|
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
|
||||||
* <td>.liv</td>
|
* <td>.liv</td>
|
||||||
* <td>Info about what files are live</td>
|
* <td>Info about what documents are live</td>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}</td>
|
||||||
|
* <td>.dii, .dim</td>
|
||||||
|
* <td>Holds indexed points, if any</td>
|
||||||
* </tr>
|
* </tr>
|
||||||
* </table>
|
* </table>
|
||||||
* </div>
|
* </div>
|
||||||
@ -374,6 +385,8 @@
|
|||||||
* that is suitable for faceting/sorting/analytics.
|
* that is suitable for faceting/sorting/analytics.
|
||||||
* <li>In version 5.4, DocValues have been improved to store more information on disk:
|
* <li>In version 5.4, DocValues have been improved to store more information on disk:
|
||||||
* addresses for binary fields and ord indexes for multi-valued fields.
|
* addresses for binary fields and ord indexes for multi-valued fields.
|
||||||
|
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||||
|
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
|
||||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
|
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
|
||||||
* thanks to an iterator API.
|
* thanks to an iterator API.
|
||||||
* </li>
|
* </li>
|
||||||
|
@ -1801,161 +1801,32 @@ public final class CheckIndex implements Closeable {
|
|||||||
}
|
}
|
||||||
for (FieldInfo fieldInfo : fieldInfos) {
|
for (FieldInfo fieldInfo : fieldInfos) {
|
||||||
if (fieldInfo.getPointDimensionCount() > 0) {
|
if (fieldInfo.getPointDimensionCount() > 0) {
|
||||||
FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc());
|
|
||||||
status.totalValueFields++;
|
|
||||||
int dimCount = fieldInfo.getPointDimensionCount();
|
|
||||||
int bytesPerDim = fieldInfo.getPointNumBytes();
|
|
||||||
int packedBytesCount = dimCount * bytesPerDim;
|
|
||||||
byte[] lastMinPackedValue = new byte[packedBytesCount];
|
|
||||||
byte[] lastMaxPackedValue = new byte[packedBytesCount];
|
|
||||||
BytesRef scratch = new BytesRef();
|
|
||||||
scratch.length = bytesPerDim;
|
|
||||||
byte[] lastPackedValue = new byte[packedBytesCount];
|
|
||||||
|
|
||||||
long[] pointCountSeen = new long[1];
|
|
||||||
|
|
||||||
PointValues values = pointsReader.getValues(fieldInfo.name);
|
PointValues values = pointsReader.getValues(fieldInfo.name);
|
||||||
if (values == null) {
|
if (values == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
byte[] globalMinPackedValue = values.getMinPackedValue();
|
|
||||||
|
status.totalValueFields++;
|
||||||
|
|
||||||
long size = values.size();
|
long size = values.size();
|
||||||
int docCount = values.getDocCount();
|
int docCount = values.getDocCount();
|
||||||
|
|
||||||
if (docCount > size) {
|
VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values);
|
||||||
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount);
|
values.intersect(visitor);
|
||||||
|
|
||||||
|
if (visitor.getPointCountSeen() != size) {
|
||||||
|
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (docCount > reader.maxDoc()) {
|
if (visitor.getDocCountSeen() != docCount) {
|
||||||
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but that's greater than maxDoc=" + reader.maxDoc());
|
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + visitor.getDocCountSeen());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (globalMinPackedValue == null) {
|
status.totalValuePoints += visitor.getPointCountSeen();
|
||||||
if (size != 0) {
|
|
||||||
throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
|
|
||||||
}
|
|
||||||
} else if (globalMinPackedValue.length != packedBytesCount) {
|
|
||||||
throw new RuntimeException("getMinPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
|
|
||||||
}
|
|
||||||
byte[] globalMaxPackedValue = values.getMaxPackedValue();
|
|
||||||
if (globalMaxPackedValue == null) {
|
|
||||||
if (size != 0) {
|
|
||||||
throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
|
|
||||||
}
|
|
||||||
} else if (globalMaxPackedValue.length != packedBytesCount) {
|
|
||||||
throw new RuntimeException("getMaxPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
values.intersect(new PointValues.IntersectVisitor() {
|
|
||||||
|
|
||||||
private int lastDocID = -1;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void visit(int docID) {
|
|
||||||
throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void visit(int docID, byte[] packedValue) {
|
|
||||||
checkPackedValue("packed value", packedValue, docID);
|
|
||||||
pointCountSeen[0]++;
|
|
||||||
docsSeen.set(docID);
|
|
||||||
|
|
||||||
for(int dim=0;dim<dimCount;dim++) {
|
|
||||||
int offset = bytesPerDim * dim;
|
|
||||||
|
|
||||||
// Compare to last cell:
|
|
||||||
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
|
|
||||||
// This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
|
|
||||||
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
|
|
||||||
// This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
|
|
||||||
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
|
|
||||||
// increasing docID:
|
|
||||||
if (dimCount == 1) {
|
|
||||||
int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
|
|
||||||
if (cmp > 0) {
|
|
||||||
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
|
|
||||||
} else if (cmp == 0) {
|
|
||||||
if (docID < lastDocID) {
|
|
||||||
throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
|
|
||||||
lastDocID = docID;
|
|
||||||
}
|
|
||||||
|
|
||||||
status.totalValuePoints++;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
|
||||||
checkPackedValue("min packed value", minPackedValue, -1);
|
|
||||||
System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
|
|
||||||
checkPackedValue("max packed value", maxPackedValue, -1);
|
|
||||||
System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
|
|
||||||
|
|
||||||
for(int dim=0;dim<dimCount;dim++) {
|
|
||||||
int offset = bytesPerDim * dim;
|
|
||||||
|
|
||||||
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
|
|
||||||
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
|
||||||
" is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure this cell is not outside of the global min/max:
|
|
||||||
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
|
|
||||||
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
|
||||||
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
|
|
||||||
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
|
|
||||||
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
|
|
||||||
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
|
||||||
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
|
|
||||||
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
|
|
||||||
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We always pretend the query shape is so complex that it crosses every cell, so
|
|
||||||
// that packedValue is passed for every document
|
|
||||||
return PointValues.Relation.CELL_CROSSES_QUERY;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void checkPackedValue(String desc, byte[] packedValue, int docID) {
|
|
||||||
if (packedValue == null) {
|
|
||||||
throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (packedValue.length != packedBytesCount) {
|
|
||||||
throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (pointCountSeen[0] != size) {
|
|
||||||
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (docsSeen.cardinality() != docCount) {
|
|
||||||
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS)));
|
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS)));
|
||||||
|
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
@ -1972,6 +1843,167 @@ public final class CheckIndex implements Closeable {
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries.
|
||||||
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public static class VerifyPointsVisitor implements PointValues.IntersectVisitor {
|
||||||
|
private long pointCountSeen;
|
||||||
|
private int lastDocID = -1;
|
||||||
|
private final int maxDoc;
|
||||||
|
private final FixedBitSet docsSeen;
|
||||||
|
private final byte[] lastMinPackedValue;
|
||||||
|
private final byte[] lastMaxPackedValue;
|
||||||
|
private final byte[] lastPackedValue;
|
||||||
|
private final byte[] globalMinPackedValue;
|
||||||
|
private final byte[] globalMaxPackedValue;
|
||||||
|
private final int packedBytesCount;
|
||||||
|
private final int numDims;
|
||||||
|
private final int bytesPerDim;
|
||||||
|
private final String fieldName;
|
||||||
|
|
||||||
|
/** Sole constructor */
|
||||||
|
public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException {
|
||||||
|
this.maxDoc = maxDoc;
|
||||||
|
this.fieldName = fieldName;
|
||||||
|
numDims = values.getNumDimensions();
|
||||||
|
bytesPerDim = values.getBytesPerDimension();
|
||||||
|
packedBytesCount = numDims * bytesPerDim;
|
||||||
|
globalMinPackedValue = values.getMinPackedValue();
|
||||||
|
globalMaxPackedValue = values.getMaxPackedValue();
|
||||||
|
docsSeen = new FixedBitSet(maxDoc);
|
||||||
|
lastMinPackedValue = new byte[packedBytesCount];
|
||||||
|
lastMaxPackedValue = new byte[packedBytesCount];
|
||||||
|
lastPackedValue = new byte[packedBytesCount];
|
||||||
|
|
||||||
|
if (values.getDocCount() > values.size()) {
|
||||||
|
throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (values.getDocCount() > maxDoc) {
|
||||||
|
throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (globalMinPackedValue == null) {
|
||||||
|
if (values.size() != 0) {
|
||||||
|
throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
|
||||||
|
}
|
||||||
|
} else if (globalMinPackedValue.length != packedBytesCount) {
|
||||||
|
throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
|
||||||
|
}
|
||||||
|
if (globalMaxPackedValue == null) {
|
||||||
|
if (values.size() != 0) {
|
||||||
|
throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
|
||||||
|
}
|
||||||
|
} else if (globalMaxPackedValue.length != packedBytesCount) {
|
||||||
|
throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns total number of points in this BKD tree */
|
||||||
|
public long getPointCountSeen() {
|
||||||
|
return pointCountSeen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns total number of unique docIDs in this BKD tree */
|
||||||
|
public long getDocCountSeen() {
|
||||||
|
return docsSeen.cardinality();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID) {
|
||||||
|
throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) {
|
||||||
|
checkPackedValue("packed value", packedValue, docID);
|
||||||
|
pointCountSeen++;
|
||||||
|
docsSeen.set(docID);
|
||||||
|
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int offset = bytesPerDim * dim;
|
||||||
|
|
||||||
|
// Compare to last cell:
|
||||||
|
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
|
||||||
|
// This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
|
||||||
|
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
|
||||||
|
// This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
|
||||||
|
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
|
||||||
|
// increasing docID:
|
||||||
|
if (numDims == 1) {
|
||||||
|
int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
|
||||||
|
if (cmp > 0) {
|
||||||
|
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
|
||||||
|
} else if (cmp == 0) {
|
||||||
|
if (docID < lastDocID) {
|
||||||
|
throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
|
||||||
|
lastDocID = docID;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
|
checkPackedValue("min packed value", minPackedValue, -1);
|
||||||
|
System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
|
||||||
|
checkPackedValue("max packed value", maxPackedValue, -1);
|
||||||
|
System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
|
||||||
|
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
int offset = bytesPerDim * dim;
|
||||||
|
|
||||||
|
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
|
||||||
|
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
||||||
|
" is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure this cell is not outside of the global min/max:
|
||||||
|
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
|
||||||
|
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
||||||
|
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
|
||||||
|
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
|
||||||
|
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
|
||||||
|
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
|
||||||
|
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
|
||||||
|
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
|
||||||
|
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We always pretend the query shape is so complex that it crosses every cell, so
|
||||||
|
// that packedValue is passed for every document
|
||||||
|
return PointValues.Relation.CELL_CROSSES_QUERY;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkPackedValue(String desc, byte[] packedValue, int docID) {
|
||||||
|
if (packedValue == null) {
|
||||||
|
throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (packedValue.length != packedBytesCount) {
|
||||||
|
throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldName + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test stored fields.
|
* Test stored fields.
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
|
@ -17,14 +17,15 @@
|
|||||||
package org.apache.lucene.util.bkd;
|
package org.apache.lucene.util.bkd;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.index.CorruptIndexException;
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
import org.apache.lucene.index.PointValues;
|
import org.apache.lucene.index.PointValues;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.util.Accountable;
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.MathUtil;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
|
||||||
@ -32,14 +33,12 @@ import org.apache.lucene.util.StringHelper;
|
|||||||
*
|
*
|
||||||
* @lucene.experimental */
|
* @lucene.experimental */
|
||||||
|
|
||||||
public class BKDReader extends PointValues implements Accountable {
|
public final class BKDReader extends PointValues implements Accountable {
|
||||||
// Packed array of byte[] holding all split values in the full binary tree:
|
// Packed array of byte[] holding all split values in the full binary tree:
|
||||||
final private byte[] splitPackedValues;
|
final int leafNodeOffset;
|
||||||
final long[] leafBlockFPs;
|
|
||||||
final private int leafNodeOffset;
|
|
||||||
final int numDims;
|
final int numDims;
|
||||||
final int bytesPerDim;
|
final int bytesPerDim;
|
||||||
final int bytesPerIndexEntry;
|
final int numLeaves;
|
||||||
final IndexInput in;
|
final IndexInput in;
|
||||||
final int maxPointsInLeafNode;
|
final int maxPointsInLeafNode;
|
||||||
final byte[] minPackedValue;
|
final byte[] minPackedValue;
|
||||||
@ -49,6 +48,14 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
final int version;
|
final int version;
|
||||||
protected final int packedBytesLength;
|
protected final int packedBytesLength;
|
||||||
|
|
||||||
|
// Used for 6.4.0+ index format:
|
||||||
|
final byte[] packedIndex;
|
||||||
|
|
||||||
|
// Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index:
|
||||||
|
final private byte[] splitPackedValues;
|
||||||
|
final int bytesPerIndexEntry;
|
||||||
|
final long[] leafBlockFPs;
|
||||||
|
|
||||||
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
|
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
|
||||||
public BKDReader(IndexInput in) throws IOException {
|
public BKDReader(IndexInput in) throws IOException {
|
||||||
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
|
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
|
||||||
@ -59,7 +66,7 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
packedBytesLength = numDims * bytesPerDim;
|
packedBytesLength = numDims * bytesPerDim;
|
||||||
|
|
||||||
// Read index:
|
// Read index:
|
||||||
int numLeaves = in.readVInt();
|
numLeaves = in.readVInt();
|
||||||
assert numLeaves > 0;
|
assert numLeaves > 0;
|
||||||
leafNodeOffset = numLeaves;
|
leafNodeOffset = numLeaves;
|
||||||
|
|
||||||
@ -78,203 +85,378 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
pointCount = in.readVLong();
|
pointCount = in.readVLong();
|
||||||
docCount = in.readVInt();
|
docCount = in.readVInt();
|
||||||
|
|
||||||
splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
|
if (version >= BKDWriter.VERSION_PACKED_INDEX) {
|
||||||
|
int numBytes = in.readVInt();
|
||||||
// TODO: don't write split packed values[0]!
|
packedIndex = new byte[numBytes];
|
||||||
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
|
in.readBytes(packedIndex, 0, numBytes);
|
||||||
|
leafBlockFPs = null;
|
||||||
// Read the file pointers to the start of each leaf block:
|
splitPackedValues = null;
|
||||||
long[] leafBlockFPs = new long[numLeaves];
|
|
||||||
long lastFP = 0;
|
|
||||||
for(int i=0;i<numLeaves;i++) {
|
|
||||||
long delta = in.readVLong();
|
|
||||||
leafBlockFPs[i] = lastFP + delta;
|
|
||||||
lastFP += delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
|
|
||||||
// if it was created by BKDWriter.merge). In this case the leaf nodes may straddle the two bottom
|
|
||||||
// levels of the binary tree:
|
|
||||||
if (numDims == 1 && numLeaves > 1) {
|
|
||||||
//System.out.println("BKDR: numLeaves=" + numLeaves);
|
|
||||||
int levelCount = 2;
|
|
||||||
while (true) {
|
|
||||||
//System.out.println(" cycle levelCount=" + levelCount);
|
|
||||||
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
|
|
||||||
int lastLevel = 2*(numLeaves - levelCount);
|
|
||||||
assert lastLevel >= 0;
|
|
||||||
/*
|
|
||||||
System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount);
|
|
||||||
System.out.println("FPs before:");
|
|
||||||
for(int i=0;i<leafBlockFPs.length;i++) {
|
|
||||||
System.out.println(" " + i + " " + leafBlockFPs[i]);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
if (lastLevel != 0) {
|
|
||||||
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
|
|
||||||
// at read-time, so that we can still delta code them on disk at write:
|
|
||||||
//System.out.println("BKDR: now rotate index");
|
|
||||||
long[] newLeafBlockFPs = new long[numLeaves];
|
|
||||||
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
|
|
||||||
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
|
|
||||||
leafBlockFPs = newLeafBlockFPs;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
System.out.println("FPs:");
|
|
||||||
for(int i=0;i<leafBlockFPs.length;i++) {
|
|
||||||
System.out.println(" " + i + " " + leafBlockFPs[i]);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
levelCount *= 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.leafBlockFPs = leafBlockFPs;
|
|
||||||
this.in = in;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Called by consumers that have their own on-disk format for the index (e.g. SimpleText) */
|
|
||||||
protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
|
|
||||||
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
|
|
||||||
this.in = in;
|
|
||||||
this.numDims = numDims;
|
|
||||||
this.maxPointsInLeafNode = maxPointsInLeafNode;
|
|
||||||
this.bytesPerDim = bytesPerDim;
|
|
||||||
// no version check here because callers of this API (SimpleText) have no back compat:
|
|
||||||
bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
|
|
||||||
packedBytesLength = numDims * bytesPerDim;
|
|
||||||
this.leafNodeOffset = leafBlockFPs.length;
|
|
||||||
this.leafBlockFPs = leafBlockFPs;
|
|
||||||
this.splitPackedValues = splitPackedValues;
|
|
||||||
this.minPackedValue = minPackedValue;
|
|
||||||
this.maxPackedValue = maxPackedValue;
|
|
||||||
this.pointCount = pointCount;
|
|
||||||
this.docCount = docCount;
|
|
||||||
this.version = BKDWriter.VERSION_CURRENT;
|
|
||||||
assert minPackedValue.length == packedBytesLength;
|
|
||||||
assert maxPackedValue.length == packedBytesLength;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class VerifyVisitor implements IntersectVisitor {
|
|
||||||
byte[] cellMinPacked;
|
|
||||||
byte[] cellMaxPacked;
|
|
||||||
byte[] lastPackedValue;
|
|
||||||
final int numDims;
|
|
||||||
final int bytesPerDim;
|
|
||||||
final int maxDoc;
|
|
||||||
|
|
||||||
public VerifyVisitor(int numDims, int bytesPerDim, int maxDoc) {
|
|
||||||
this.numDims = numDims;
|
|
||||||
this.bytesPerDim = bytesPerDim;
|
|
||||||
this.maxDoc = maxDoc;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void visit(int docID) {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void visit(int docID, byte[] packedValue) {
|
|
||||||
if (docID < 0 || docID >= maxDoc) {
|
|
||||||
throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." + maxDoc);
|
|
||||||
}
|
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
|
||||||
if (StringHelper.compare(bytesPerDim, cellMinPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) > 0) {
|
|
||||||
throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim));
|
|
||||||
}
|
|
||||||
if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) {
|
|
||||||
throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (numDims == 1) {
|
|
||||||
// With only 1D, all values should always be in sorted order
|
|
||||||
if (lastPackedValue == null) {
|
|
||||||
lastPackedValue = Arrays.copyOf(packedValue, packedValue.length);
|
|
||||||
} else if (StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
|
|
||||||
throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue));
|
|
||||||
} else {
|
|
||||||
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */
|
|
||||||
// TODO: maybe we can get this into CheckIndex?
|
|
||||||
public void verify(int maxDoc) throws IOException {
|
|
||||||
//System.out.println("BKDR.verify this=" + this);
|
|
||||||
// Visits every doc in every leaf block and confirms that
|
|
||||||
// their values agree with the index:
|
|
||||||
byte[] rootMinPacked = new byte[packedBytesLength];
|
|
||||||
byte[] rootMaxPacked = new byte[packedBytesLength];
|
|
||||||
Arrays.fill(rootMaxPacked, (byte) 0xff);
|
|
||||||
verify(getIntersectState(new VerifyVisitor(numDims, bytesPerDim, maxDoc)), 1, rootMinPacked, rootMaxPacked);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
|
|
||||||
|
|
||||||
if (nodeID >= leafNodeOffset) {
|
|
||||||
int leafID = nodeID - leafNodeOffset;
|
|
||||||
|
|
||||||
// In the unbalanced case it's possible the left most node only has one child:
|
|
||||||
if (leafID < leafBlockFPs.length) {
|
|
||||||
//System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]);
|
|
||||||
//System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked));
|
|
||||||
|
|
||||||
// Leaf node: check that all values are in fact in bounds:
|
|
||||||
VerifyVisitor visitor = (VerifyVisitor) state.visitor;
|
|
||||||
visitor.cellMinPacked = cellMinPacked;
|
|
||||||
visitor.cellMaxPacked = cellMaxPacked;
|
|
||||||
|
|
||||||
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
|
|
||||||
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
|
||||||
} else {
|
|
||||||
//System.out.println("BKDR.verify skip leafID=" + leafID);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// Non-leaf node:
|
// legacy un-packed index
|
||||||
|
|
||||||
int address = nodeID * bytesPerIndexEntry;
|
splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
|
||||||
int splitDim;
|
|
||||||
if (numDims == 1) {
|
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
|
||||||
splitDim = 0;
|
|
||||||
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
|
// Read the file pointers to the start of each leaf block:
|
||||||
// skip over wastefully encoded 0 splitDim:
|
long[] leafBlockFPs = new long[numLeaves];
|
||||||
assert splitPackedValues[address] == 0;
|
long lastFP = 0;
|
||||||
address++;
|
for(int i=0;i<numLeaves;i++) {
|
||||||
|
long delta = in.readVLong();
|
||||||
|
leafBlockFPs[i] = lastFP + delta;
|
||||||
|
lastFP += delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
|
||||||
|
// if it was created by BKDWriter.merge or OneDimWriter). In this case the leaf nodes may straddle the two bottom
|
||||||
|
// levels of the binary tree:
|
||||||
|
if (numDims == 1 && numLeaves > 1) {
|
||||||
|
int levelCount = 2;
|
||||||
|
while (true) {
|
||||||
|
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
|
||||||
|
int lastLevel = 2*(numLeaves - levelCount);
|
||||||
|
assert lastLevel >= 0;
|
||||||
|
if (lastLevel != 0) {
|
||||||
|
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
|
||||||
|
// at read-time, so that we can still delta code them on disk at write:
|
||||||
|
long[] newLeafBlockFPs = new long[numLeaves];
|
||||||
|
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
|
||||||
|
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
|
||||||
|
leafBlockFPs = newLeafBlockFPs;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
levelCount *= 2;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
splitDim = splitPackedValues[address++] & 0xff;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
assert splitDim < numDims;
|
this.leafBlockFPs = leafBlockFPs;
|
||||||
|
packedIndex = null;
|
||||||
byte[] splitPackedValue = new byte[packedBytesLength];
|
|
||||||
|
|
||||||
// Recurse on left sub-tree:
|
|
||||||
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
|
|
||||||
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
|
||||||
verify(state,
|
|
||||||
2*nodeID,
|
|
||||||
cellMinPacked, splitPackedValue);
|
|
||||||
|
|
||||||
// Recurse on right sub-tree:
|
|
||||||
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
|
|
||||||
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
|
||||||
verify(state,
|
|
||||||
2*nodeID+1,
|
|
||||||
splitPackedValue, cellMaxPacked);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.in = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
long getMinLeafBlockFP() {
|
||||||
|
if (packedIndex != null) {
|
||||||
|
return new ByteArrayDataInput(packedIndex).readVLong();
|
||||||
|
} else {
|
||||||
|
long minFP = Long.MAX_VALUE;
|
||||||
|
for(long fp : leafBlockFPs) {
|
||||||
|
minFP = Math.min(minFP, fp);
|
||||||
|
}
|
||||||
|
return minFP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Used to walk the in-heap index
|
||||||
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public abstract class IndexTree implements Cloneable {
|
||||||
|
protected int nodeID;
|
||||||
|
// level is 1-based so that we can do level-1 w/o checking each time:
|
||||||
|
protected int level;
|
||||||
|
protected int splitDim;
|
||||||
|
protected final byte[][] splitPackedValueStack;
|
||||||
|
|
||||||
|
protected IndexTree() {
|
||||||
|
int treeDepth = getTreeDepth();
|
||||||
|
splitPackedValueStack = new byte[treeDepth+1][];
|
||||||
|
nodeID = 1;
|
||||||
|
level = 1;
|
||||||
|
splitPackedValueStack[level] = new byte[packedBytesLength];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void pushLeft() {
|
||||||
|
nodeID *= 2;
|
||||||
|
level++;
|
||||||
|
if (splitPackedValueStack[level] == null) {
|
||||||
|
splitPackedValueStack[level] = new byte[packedBytesLength];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Clone, but you are not allowed to pop up past the point where the clone happened. */
|
||||||
|
public abstract IndexTree clone();
|
||||||
|
|
||||||
|
public void pushRight() {
|
||||||
|
nodeID = nodeID * 2 + 1;
|
||||||
|
level++;
|
||||||
|
if (splitPackedValueStack[level] == null) {
|
||||||
|
splitPackedValueStack[level] = new byte[packedBytesLength];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void pop() {
|
||||||
|
nodeID /= 2;
|
||||||
|
level--;
|
||||||
|
splitDim = -1;
|
||||||
|
//System.out.println(" pop nodeID=" + nodeID);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isLeafNode() {
|
||||||
|
return nodeID >= leafNodeOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean nodeExists() {
|
||||||
|
return nodeID - leafNodeOffset < leafNodeOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNodeID() {
|
||||||
|
return nodeID;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] getSplitPackedValue() {
|
||||||
|
assert isLeafNode() == false;
|
||||||
|
assert splitPackedValueStack[level] != null: "level=" + level;
|
||||||
|
return splitPackedValueStack[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Only valid after pushLeft or pushRight, not pop! */
|
||||||
|
public int getSplitDim() {
|
||||||
|
assert isLeafNode() == false;
|
||||||
|
return splitDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Only valid after pushLeft or pushRight, not pop! */
|
||||||
|
public abstract BytesRef getSplitDimValue();
|
||||||
|
|
||||||
|
/** Only valid after pushLeft or pushRight, not pop! */
|
||||||
|
public abstract long getLeafBlockFP();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Reads the original simple yet heap-heavy index format */
|
||||||
|
private final class LegacyIndexTree extends IndexTree {
|
||||||
|
|
||||||
|
private long leafBlockFP;
|
||||||
|
private final byte[] splitDimValue = new byte[bytesPerDim];
|
||||||
|
private final BytesRef scratch = new BytesRef();
|
||||||
|
|
||||||
|
public LegacyIndexTree() {
|
||||||
|
setNodeData();
|
||||||
|
scratch.bytes = splitDimValue;
|
||||||
|
scratch.length = bytesPerDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LegacyIndexTree clone() {
|
||||||
|
LegacyIndexTree index = new LegacyIndexTree();
|
||||||
|
index.nodeID = nodeID;
|
||||||
|
index.level = level;
|
||||||
|
index.splitDim = splitDim;
|
||||||
|
index.leafBlockFP = leafBlockFP;
|
||||||
|
index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone();
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pushLeft() {
|
||||||
|
super.pushLeft();
|
||||||
|
setNodeData();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pushRight() {
|
||||||
|
super.pushRight();
|
||||||
|
setNodeData();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setNodeData() {
|
||||||
|
if (isLeafNode()) {
|
||||||
|
leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset];
|
||||||
|
splitDim = -1;
|
||||||
|
} else {
|
||||||
|
leafBlockFP = -1;
|
||||||
|
int address = nodeID * bytesPerIndexEntry;
|
||||||
|
if (numDims == 1) {
|
||||||
|
splitDim = 0;
|
||||||
|
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
|
||||||
|
// skip over wastefully encoded 0 splitDim:
|
||||||
|
assert splitPackedValues[address] == 0;
|
||||||
|
address++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
splitDim = splitPackedValues[address++] & 0xff;
|
||||||
|
}
|
||||||
|
System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getLeafBlockFP() {
|
||||||
|
assert isLeafNode();
|
||||||
|
return leafBlockFP;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getSplitDimValue() {
|
||||||
|
assert isLeafNode() == false;
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pop() {
|
||||||
|
super.pop();
|
||||||
|
leafBlockFP = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This
|
||||||
|
* format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing
|
||||||
|
* downwards one child at a time. */
|
||||||
|
private final class PackedIndexTree extends IndexTree {
|
||||||
|
// used to read the packed byte[]
|
||||||
|
private final ByteArrayDataInput in;
|
||||||
|
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
|
||||||
|
private final long[] leafBlockFPStack;
|
||||||
|
// holds the address, in the packed byte[] index, of the left-node of each level:
|
||||||
|
private final int[] leftNodePositions;
|
||||||
|
// holds the address, in the packed byte[] index, of the right-node of each level:
|
||||||
|
private final int[] rightNodePositions;
|
||||||
|
// holds the splitDim for each level:
|
||||||
|
private final int[] splitDims;
|
||||||
|
// true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
|
||||||
|
// 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
|
||||||
|
// split on this dimension, we next pushed to the left sub-tree:
|
||||||
|
private final boolean[] negativeDeltas;
|
||||||
|
// holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
|
||||||
|
private final byte[][] splitValuesStack;
|
||||||
|
// scratch value to return from getPackedValue:
|
||||||
|
private final BytesRef scratch;
|
||||||
|
|
||||||
|
public PackedIndexTree() {
|
||||||
|
int treeDepth = getTreeDepth();
|
||||||
|
leafBlockFPStack = new long[treeDepth+1];
|
||||||
|
leftNodePositions = new int[treeDepth+1];
|
||||||
|
rightNodePositions = new int[treeDepth+1];
|
||||||
|
splitValuesStack = new byte[treeDepth+1][];
|
||||||
|
splitDims = new int[treeDepth+1];
|
||||||
|
negativeDeltas = new boolean[numDims*(treeDepth+1)];
|
||||||
|
|
||||||
|
in = new ByteArrayDataInput(packedIndex);
|
||||||
|
splitValuesStack[0] = new byte[packedBytesLength];
|
||||||
|
readNodeData(false);
|
||||||
|
scratch = new BytesRef();
|
||||||
|
scratch.length = bytesPerDim;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public PackedIndexTree clone() {
|
||||||
|
PackedIndexTree index = new PackedIndexTree();
|
||||||
|
index.nodeID = nodeID;
|
||||||
|
index.level = level;
|
||||||
|
index.splitDim = splitDim;
|
||||||
|
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
|
||||||
|
index.leafBlockFPStack[level] = leafBlockFPStack[level];
|
||||||
|
index.leftNodePositions[level] = leftNodePositions[level];
|
||||||
|
index.rightNodePositions[level] = rightNodePositions[level];
|
||||||
|
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
|
||||||
|
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
|
||||||
|
index.splitDims[level] = splitDims[level];
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pushLeft() {
|
||||||
|
int nodePosition = leftNodePositions[level];
|
||||||
|
super.pushLeft();
|
||||||
|
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
|
||||||
|
assert splitDim != -1;
|
||||||
|
negativeDeltas[level*numDims+splitDim] = true;
|
||||||
|
in.setPosition(nodePosition);
|
||||||
|
readNodeData(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pushRight() {
|
||||||
|
int nodePosition = rightNodePositions[level];
|
||||||
|
super.pushRight();
|
||||||
|
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
|
||||||
|
assert splitDim != -1;
|
||||||
|
negativeDeltas[level*numDims+splitDim] = false;
|
||||||
|
in.setPosition(nodePosition);
|
||||||
|
readNodeData(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pop() {
|
||||||
|
super.pop();
|
||||||
|
splitDim = splitDims[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getLeafBlockFP() {
|
||||||
|
assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
|
||||||
|
return leafBlockFPStack[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getSplitDimValue() {
|
||||||
|
assert isLeafNode() == false;
|
||||||
|
scratch.bytes = splitValuesStack[level];
|
||||||
|
scratch.offset = splitDim * bytesPerDim;
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readNodeData(boolean isLeft) {
|
||||||
|
|
||||||
|
leafBlockFPStack[level] = leafBlockFPStack[level-1];
|
||||||
|
|
||||||
|
// read leaf block FP delta
|
||||||
|
if (isLeft == false) {
|
||||||
|
leafBlockFPStack[level] += in.readVLong();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isLeafNode()) {
|
||||||
|
splitDim = -1;
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// read split dim, prefix, firstDiffByteDelta encoded as int:
|
||||||
|
int code = in.readVInt();
|
||||||
|
splitDim = code % numDims;
|
||||||
|
splitDims[level] = splitDim;
|
||||||
|
code /= numDims;
|
||||||
|
int prefix = code % (1+bytesPerDim);
|
||||||
|
int suffix = bytesPerDim - prefix;
|
||||||
|
|
||||||
|
if (splitValuesStack[level] == null) {
|
||||||
|
splitValuesStack[level] = new byte[packedBytesLength];
|
||||||
|
}
|
||||||
|
System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength);
|
||||||
|
if (suffix > 0) {
|
||||||
|
int firstDiffByteDelta = code / (1+bytesPerDim);
|
||||||
|
if (negativeDeltas[level*numDims + splitDim]) {
|
||||||
|
firstDiffByteDelta = -firstDiffByteDelta;
|
||||||
|
}
|
||||||
|
int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF;
|
||||||
|
splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta);
|
||||||
|
in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1);
|
||||||
|
} else {
|
||||||
|
// our split value is == last split value in this dim, which can happen when there are many duplicate values
|
||||||
|
}
|
||||||
|
|
||||||
|
int leftNumBytes;
|
||||||
|
if (nodeID * 2 < leafNodeOffset) {
|
||||||
|
leftNumBytes = in.readVInt();
|
||||||
|
} else {
|
||||||
|
leftNumBytes = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
leftNodePositions[level] = in.getPosition();
|
||||||
|
rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getTreeDepth() {
|
||||||
|
// First +1 because all the non-leave nodes makes another power
|
||||||
|
// of 2; e.g. to have a fully balanced tree with 4 leaves you
|
||||||
|
// need a depth=3 tree:
|
||||||
|
|
||||||
|
// Second +1 because MathUtil.log computes floor of the logarithm; e.g.
|
||||||
|
// with 5 leaves you need a depth=4 tree:
|
||||||
|
return MathUtil.log(numLeaves, 2) + 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Used to track all state for a single call to {@link #intersect}. */
|
/** Used to track all state for a single call to {@link #intersect}. */
|
||||||
@ -285,57 +467,73 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
final int[] commonPrefixLengths;
|
final int[] commonPrefixLengths;
|
||||||
|
|
||||||
final IntersectVisitor visitor;
|
final IntersectVisitor visitor;
|
||||||
|
public final IndexTree index;
|
||||||
|
|
||||||
public IntersectState(IndexInput in, int numDims,
|
public IntersectState(IndexInput in, int numDims,
|
||||||
int packedBytesLength,
|
int packedBytesLength,
|
||||||
int maxPointsInLeafNode,
|
int maxPointsInLeafNode,
|
||||||
IntersectVisitor visitor) {
|
IntersectVisitor visitor,
|
||||||
|
IndexTree indexVisitor) {
|
||||||
this.in = in;
|
this.in = in;
|
||||||
this.visitor = visitor;
|
this.visitor = visitor;
|
||||||
this.commonPrefixLengths = new int[numDims];
|
this.commonPrefixLengths = new int[numDims];
|
||||||
this.scratchDocIDs = new int[maxPointsInLeafNode];
|
this.scratchDocIDs = new int[maxPointsInLeafNode];
|
||||||
this.scratchPackedValue = new byte[packedBytesLength];
|
this.scratchPackedValue = new byte[packedBytesLength];
|
||||||
|
this.index = indexVisitor;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void intersect(IntersectVisitor visitor) throws IOException {
|
public void intersect(IntersectVisitor visitor) throws IOException {
|
||||||
intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
|
intersect(getIntersectState(visitor), minPackedValue, maxPackedValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
|
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
|
||||||
private void addAll(IntersectState state, int nodeID) throws IOException {
|
private void addAll(IntersectState state) throws IOException {
|
||||||
//System.out.println("R: addAll nodeID=" + nodeID);
|
//System.out.println("R: addAll nodeID=" + nodeID);
|
||||||
|
|
||||||
if (nodeID >= leafNodeOffset) {
|
if (state.index.isLeafNode()) {
|
||||||
//System.out.println("ADDALL");
|
//System.out.println("ADDALL");
|
||||||
visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
|
if (state.index.nodeExists()) {
|
||||||
|
visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor);
|
||||||
|
}
|
||||||
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
||||||
} else {
|
} else {
|
||||||
addAll(state, 2*nodeID);
|
state.index.pushLeft();
|
||||||
addAll(state, 2*nodeID+1);
|
addAll(state);
|
||||||
|
state.index.pop();
|
||||||
|
|
||||||
|
state.index.pushRight();
|
||||||
|
addAll(state);
|
||||||
|
state.index.pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create a new {@link IntersectState} */
|
/** Create a new {@link IntersectState} */
|
||||||
public IntersectState getIntersectState(IntersectVisitor visitor) {
|
public IntersectState getIntersectState(IntersectVisitor visitor) {
|
||||||
|
IndexTree index;
|
||||||
|
if (packedIndex != null) {
|
||||||
|
index = new PackedIndexTree();
|
||||||
|
} else {
|
||||||
|
index = new LegacyIndexTree();
|
||||||
|
}
|
||||||
return new IntersectState(in.clone(), numDims,
|
return new IntersectState(in.clone(), numDims,
|
||||||
packedBytesLength,
|
packedBytesLength,
|
||||||
maxPointsInLeafNode,
|
maxPointsInLeafNode,
|
||||||
visitor);
|
visitor,
|
||||||
|
index);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Visits all docIDs and packed values in a single leaf block */
|
/** Visits all docIDs and packed values in a single leaf block */
|
||||||
public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
|
public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException {
|
||||||
int leafID = nodeID - leafNodeOffset;
|
|
||||||
|
|
||||||
// Leaf node; scan and filter all points in this block:
|
// Leaf node; scan and filter all points in this block:
|
||||||
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
|
int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
|
||||||
|
|
||||||
// Again, this time reading values and checking with the visitor
|
// Again, this time reading values and checking with the visitor
|
||||||
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
|
private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
|
||||||
// Leaf node
|
// Leaf node
|
||||||
in.seek(blockFP);
|
in.seek(blockFP);
|
||||||
|
|
||||||
@ -350,7 +548,7 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
|
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
|
||||||
in.seek(blockFP);
|
in.seek(blockFP);
|
||||||
|
|
||||||
// How many points are stored in this leaf cell:
|
// How many points are stored in this leaf cell:
|
||||||
@ -365,7 +563,7 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||||
visitor.grow(count);
|
visitor.grow(count);
|
||||||
|
|
||||||
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
|
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
|
||||||
@ -434,13 +632,10 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void intersect(IntersectState state,
|
private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
|
||||||
int nodeID,
|
|
||||||
byte[] cellMinPacked, byte[] cellMaxPacked)
|
|
||||||
throws IOException {
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
System.out.println("\nR: intersect nodeID=" + nodeID);
|
System.out.println("\nR: intersect nodeID=" + state.index.getNodeID());
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
|
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
|
||||||
}
|
}
|
||||||
@ -450,24 +645,18 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
|
|
||||||
if (r == Relation.CELL_OUTSIDE_QUERY) {
|
if (r == Relation.CELL_OUTSIDE_QUERY) {
|
||||||
// This cell is fully outside of the query shape: stop recursing
|
// This cell is fully outside of the query shape: stop recursing
|
||||||
return;
|
|
||||||
} else if (r == Relation.CELL_INSIDE_QUERY) {
|
} else if (r == Relation.CELL_INSIDE_QUERY) {
|
||||||
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
|
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
|
||||||
addAll(state, nodeID);
|
addAll(state);
|
||||||
return;
|
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering:
|
||||||
} else {
|
} else if (state.index.isLeafNode()) {
|
||||||
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nodeID >= leafNodeOffset) {
|
|
||||||
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
// TODO: we can assert that the first value here in fact matches what the index claimed?
|
||||||
|
|
||||||
int leafID = nodeID - leafNodeOffset;
|
|
||||||
|
|
||||||
// In the unbalanced case it's possible the left most node only has one child:
|
// In the unbalanced case it's possible the left most node only has one child:
|
||||||
if (leafID < leafBlockFPs.length) {
|
if (state.index.nodeExists()) {
|
||||||
// Leaf node; scan and filter all points in this block:
|
// Leaf node; scan and filter all points in this block:
|
||||||
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
|
int count = readDocIDs(state.in, state.index.getLeafBlockFP(), state.scratchDocIDs);
|
||||||
|
|
||||||
// Again, this time reading values and checking with the visitor
|
// Again, this time reading values and checking with the visitor
|
||||||
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
|
||||||
@ -476,65 +665,45 @@ public class BKDReader extends PointValues implements Accountable {
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Non-leaf node: recurse on the split left and right nodes
|
// Non-leaf node: recurse on the split left and right nodes
|
||||||
|
int splitDim = state.index.getSplitDim();
|
||||||
int address = nodeID * bytesPerIndexEntry;
|
assert splitDim >= 0: "splitDim=" + splitDim;
|
||||||
int splitDim;
|
|
||||||
if (numDims == 1) {
|
|
||||||
splitDim = 0;
|
|
||||||
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
|
|
||||||
// skip over wastefully encoded 0 splitDim:
|
|
||||||
assert splitPackedValues[address] == 0;
|
|
||||||
address++;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
splitDim = splitPackedValues[address++] & 0xff;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert splitDim < numDims;
|
assert splitDim < numDims;
|
||||||
|
|
||||||
// TODO: can we alloc & reuse this up front?
|
byte[] splitPackedValue = state.index.getSplitPackedValue();
|
||||||
|
BytesRef splitDimValue = state.index.getSplitDimValue();
|
||||||
|
assert splitDimValue.length == bytesPerDim;
|
||||||
|
//System.out.println(" splitDimValue=" + splitDimValue + " splitDim=" + splitDim);
|
||||||
|
|
||||||
byte[] splitPackedValue = new byte[packedBytesLength];
|
// make sure cellMin <= splitValue <= cellMax:
|
||||||
|
assert StringHelper.compare(bytesPerDim, cellMinPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) <= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
|
||||||
|
assert StringHelper.compare(bytesPerDim, cellMaxPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) >= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
|
||||||
|
|
||||||
// Recurse on left sub-tree:
|
// Recurse on left sub-tree:
|
||||||
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
|
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
|
||||||
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
intersect(state,
|
state.index.pushLeft();
|
||||||
2*nodeID,
|
intersect(state, cellMinPacked, splitPackedValue);
|
||||||
cellMinPacked, splitPackedValue);
|
state.index.pop();
|
||||||
|
|
||||||
|
// Restore the split dim value since it may have been overwritten while recursing:
|
||||||
|
System.arraycopy(splitPackedValue, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset, bytesPerDim);
|
||||||
|
|
||||||
// Recurse on right sub-tree:
|
// Recurse on right sub-tree:
|
||||||
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
|
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
|
||||||
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
intersect(state,
|
state.index.pushRight();
|
||||||
2*nodeID+1,
|
intersect(state, splitPackedValue, cellMaxPacked);
|
||||||
splitPackedValue, cellMaxPacked);
|
state.index.pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Copies the split value for this node into the provided byte array */
|
|
||||||
public void copySplitValue(int nodeID, byte[] splitPackedValue) {
|
|
||||||
int address = nodeID * bytesPerIndexEntry;
|
|
||||||
int splitDim;
|
|
||||||
if (numDims == 1) {
|
|
||||||
splitDim = 0;
|
|
||||||
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
|
|
||||||
// skip over wastefully encoded 0 splitDim:
|
|
||||||
assert splitPackedValues[address] == 0;
|
|
||||||
address++;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
splitDim = splitPackedValues[address++] & 0xff;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert splitDim < numDims;
|
|
||||||
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long ramBytesUsed() {
|
public long ramBytesUsed() {
|
||||||
return RamUsageEstimator.sizeOf(splitPackedValues) +
|
if (packedIndex != null) {
|
||||||
RamUsageEstimator.sizeOf(leafBlockFPs);
|
return packedIndex.length;
|
||||||
|
} else {
|
||||||
|
return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -33,6 +33,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
|
|||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.RAMOutputStream;
|
||||||
import org.apache.lucene.store.TrackingDirectoryWrapper;
|
import org.apache.lucene.store.TrackingDirectoryWrapper;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
@ -83,7 +84,8 @@ public class BKDWriter implements Closeable {
|
|||||||
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
|
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
|
||||||
public static final int VERSION_COMPRESSED_VALUES = 2;
|
public static final int VERSION_COMPRESSED_VALUES = 2;
|
||||||
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
|
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
|
||||||
public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
|
public static final int VERSION_PACKED_INDEX = 4;
|
||||||
|
public static final int VERSION_CURRENT = VERSION_PACKED_INDEX;
|
||||||
|
|
||||||
/** How many bytes each docs takes in the fixed-width offline format */
|
/** How many bytes each docs takes in the fixed-width offline format */
|
||||||
private final int bytesPerDoc;
|
private final int bytesPerDoc;
|
||||||
@ -325,15 +327,10 @@ public class BKDWriter implements Closeable {
|
|||||||
bkd.numDims,
|
bkd.numDims,
|
||||||
bkd.packedBytesLength,
|
bkd.packedBytesLength,
|
||||||
bkd.maxPointsInLeafNode,
|
bkd.maxPointsInLeafNode,
|
||||||
|
null,
|
||||||
null);
|
null);
|
||||||
this.docMap = docMap;
|
this.docMap = docMap;
|
||||||
long minFP = Long.MAX_VALUE;
|
state.in.seek(bkd.getMinLeafBlockFP());
|
||||||
//System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
|
|
||||||
for(long fp : bkd.leafBlockFPs) {
|
|
||||||
minFP = Math.min(minFP, fp);
|
|
||||||
//System.out.println(" leaf fp=" + fp);
|
|
||||||
}
|
|
||||||
state.in.seek(minFP);
|
|
||||||
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
|
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -341,7 +338,7 @@ public class BKDWriter implements Closeable {
|
|||||||
//System.out.println("MR.next this=" + this);
|
//System.out.println("MR.next this=" + this);
|
||||||
while (true) {
|
while (true) {
|
||||||
if (docBlockUpto == docsInBlock) {
|
if (docBlockUpto == docsInBlock) {
|
||||||
if (blockID == bkd.leafBlockFPs.length) {
|
if (blockID == bkd.leafNodeOffset) {
|
||||||
//System.out.println(" done!");
|
//System.out.println(" done!");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -489,7 +486,6 @@ public class BKDWriter implements Closeable {
|
|||||||
return indexFP;
|
return indexFP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* In the 1D case, we can simply sort points in ascending order and use the
|
/* In the 1D case, we can simply sort points in ascending order and use the
|
||||||
* same writing logic as we use at merge time. */
|
* same writing logic as we use at merge time. */
|
||||||
private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
|
private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
|
||||||
@ -648,6 +644,7 @@ public class BKDWriter implements Closeable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void writeLeafBlock() throws IOException {
|
private void writeLeafBlock() throws IOException {
|
||||||
|
//System.out.println("writeLeafBlock pos=" + out.getFilePointer());
|
||||||
assert leafCount != 0;
|
assert leafCount != 0;
|
||||||
if (valueCount == 0) {
|
if (valueCount == 0) {
|
||||||
System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
|
System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
|
||||||
@ -811,6 +808,24 @@ public class BKDWriter implements Closeable {
|
|||||||
}.sort(0, pointCount);
|
}.sort(0, pointCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// useful for debugging:
|
||||||
|
/*
|
||||||
|
private void printPathSlice(String desc, PathSlice slice, int dim) throws IOException {
|
||||||
|
System.out.println(" " + desc + " dim=" + dim + " count=" + slice.count + ":");
|
||||||
|
try(PointReader r = slice.writer.getReader(slice.start, slice.count)) {
|
||||||
|
int count = 0;
|
||||||
|
while (r.next()) {
|
||||||
|
byte[] v = r.packedValue();
|
||||||
|
System.out.println(" " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
|
||||||
|
count++;
|
||||||
|
if (count == slice.count) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
private PointWriter sort(int dim) throws IOException {
|
private PointWriter sort(int dim) throws IOException {
|
||||||
assert dim >= 0 && dim < numDims;
|
assert dim >= 0 && dim < numDims;
|
||||||
|
|
||||||
@ -1019,46 +1034,238 @@ public class BKDWriter implements Closeable {
|
|||||||
return indexFP;
|
return indexFP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Subclass can change how it writes the index. */
|
/** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */
|
||||||
protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
|
private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
|
||||||
|
|
||||||
|
int numLeaves = leafBlockFPs.length;
|
||||||
|
|
||||||
|
// Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
|
||||||
|
// if it was created by OneDimensionBKDWriter). In this case the leaf nodes may straddle the two bottom
|
||||||
|
// levels of the binary tree:
|
||||||
|
if (numDims == 1 && numLeaves > 1) {
|
||||||
|
int levelCount = 2;
|
||||||
|
while (true) {
|
||||||
|
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
|
||||||
|
int lastLevel = 2*(numLeaves - levelCount);
|
||||||
|
assert lastLevel >= 0;
|
||||||
|
if (lastLevel != 0) {
|
||||||
|
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
|
||||||
|
// at read-time, so that we can still delta code them on disk at write:
|
||||||
|
long[] newLeafBlockFPs = new long[numLeaves];
|
||||||
|
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
|
||||||
|
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
|
||||||
|
leafBlockFPs = newLeafBlockFPs;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
levelCount *= 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Reused while packing the index */
|
||||||
|
RAMOutputStream writeBuffer = new RAMOutputStream();
|
||||||
|
|
||||||
|
// This is the "file" we append the byte[] to:
|
||||||
|
List<byte[]> blocks = new ArrayList<>();
|
||||||
|
byte[] lastSplitValues = new byte[bytesPerDim * numDims];
|
||||||
|
//System.out.println("\npack index");
|
||||||
|
int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numDims], false);
|
||||||
|
|
||||||
|
// Compact the byte[] blocks into single byte index:
|
||||||
|
byte[] index = new byte[totalSize];
|
||||||
|
int upto = 0;
|
||||||
|
for(byte[] block : blocks) {
|
||||||
|
System.arraycopy(block, 0, index, upto, block.length);
|
||||||
|
upto += block.length;
|
||||||
|
}
|
||||||
|
assert upto == totalSize;
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Appends the current contents of writeBuffer as another block on the growing in-memory file */
|
||||||
|
private int appendBlock(RAMOutputStream writeBuffer, List<byte[]> blocks) throws IOException {
|
||||||
|
int pos = Math.toIntExact(writeBuffer.getFilePointer());
|
||||||
|
byte[] bytes = new byte[pos];
|
||||||
|
writeBuffer.writeTo(bytes, 0);
|
||||||
|
writeBuffer.reset();
|
||||||
|
blocks.add(bytes);
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node
|
||||||
|
*/
|
||||||
|
private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List<byte[]> blocks,
|
||||||
|
int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException {
|
||||||
|
if (nodeID >= leafBlockFPs.length) {
|
||||||
|
int leafID = nodeID - leafBlockFPs.length;
|
||||||
|
//System.out.println("recursePack leaf nodeID=" + nodeID);
|
||||||
|
|
||||||
|
// In the unbalanced case it's possible the left most node only has one child:
|
||||||
|
if (leafID < leafBlockFPs.length) {
|
||||||
|
long delta = leafBlockFPs[leafID] - minBlockFP;
|
||||||
|
if (isLeft) {
|
||||||
|
assert delta == 0;
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
assert nodeID == 1 || delta > 0: "nodeID=" + nodeID;
|
||||||
|
writeBuffer.writeVLong(delta);
|
||||||
|
return appendBlock(writeBuffer, blocks);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
long leftBlockFP;
|
||||||
|
if (isLeft == false) {
|
||||||
|
leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID);
|
||||||
|
long delta = leftBlockFP - minBlockFP;
|
||||||
|
assert nodeID == 1 || delta > 0;
|
||||||
|
writeBuffer.writeVLong(delta);
|
||||||
|
} else {
|
||||||
|
// The left tree's left most leaf block FP is always the minimal FP:
|
||||||
|
leftBlockFP = minBlockFP;
|
||||||
|
}
|
||||||
|
|
||||||
|
int address = nodeID * (1+bytesPerDim);
|
||||||
|
int splitDim = splitPackedValues[address++] & 0xff;
|
||||||
|
|
||||||
|
//System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
|
||||||
|
|
||||||
|
// find common prefix with last split value in this dim:
|
||||||
|
int prefix = 0;
|
||||||
|
for(;prefix<bytesPerDim;prefix++) {
|
||||||
|
if (splitPackedValues[address+prefix] != lastSplitValues[splitDim * bytesPerDim + prefix]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" + numDims + " bytesPerDim=" + bytesPerDim + " prefix=" + prefix);
|
||||||
|
|
||||||
|
int firstDiffByteDelta;
|
||||||
|
if (prefix < bytesPerDim) {
|
||||||
|
//System.out.println(" delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
|
||||||
|
firstDiffByteDelta = (splitPackedValues[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
|
||||||
|
if (negativeDeltas[splitDim]) {
|
||||||
|
firstDiffByteDelta = -firstDiffByteDelta;
|
||||||
|
}
|
||||||
|
//System.out.println(" delta=" + firstDiffByteDelta);
|
||||||
|
assert firstDiffByteDelta > 0;
|
||||||
|
} else {
|
||||||
|
firstDiffByteDelta = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// pack the prefix, splitDim and delta first diff byte into a single vInt:
|
||||||
|
int code = (firstDiffByteDelta * (1+bytesPerDim) + prefix) * numDims + splitDim;
|
||||||
|
|
||||||
|
//System.out.println(" code=" + code);
|
||||||
|
//System.out.println(" splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
|
||||||
|
|
||||||
|
writeBuffer.writeVInt(code);
|
||||||
|
|
||||||
|
// write the split value, prefix coded vs. our parent's split value:
|
||||||
|
int suffix = bytesPerDim - prefix;
|
||||||
|
byte[] savSplitValue = new byte[suffix];
|
||||||
|
if (suffix > 1) {
|
||||||
|
writeBuffer.writeBytes(splitPackedValues, address+prefix+1, suffix-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] cmp = lastSplitValues.clone();
|
||||||
|
|
||||||
|
System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
|
||||||
|
|
||||||
|
// copy our split value into lastSplitValues for our children to prefix-code against
|
||||||
|
System.arraycopy(splitPackedValues, address+prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
|
||||||
|
|
||||||
|
int numBytes = appendBlock(writeBuffer, blocks);
|
||||||
|
|
||||||
|
// placeholder for left-tree numBytes; we need this so that at search time if we only need to recurse into the right sub-tree we can
|
||||||
|
// quickly seek to its starting point
|
||||||
|
int idxSav = blocks.size();
|
||||||
|
blocks.add(null);
|
||||||
|
|
||||||
|
boolean savNegativeDelta = negativeDeltas[splitDim];
|
||||||
|
negativeDeltas[splitDim] = true;
|
||||||
|
|
||||||
|
int leftNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID, lastSplitValues, negativeDeltas, true);
|
||||||
|
|
||||||
|
if (nodeID * 2 < leafBlockFPs.length) {
|
||||||
|
writeBuffer.writeVInt(leftNumBytes);
|
||||||
|
} else {
|
||||||
|
assert leftNumBytes == 0: "leftNumBytes=" + leftNumBytes;
|
||||||
|
}
|
||||||
|
int numBytes2 = Math.toIntExact(writeBuffer.getFilePointer());
|
||||||
|
byte[] bytes2 = new byte[numBytes2];
|
||||||
|
writeBuffer.writeTo(bytes2, 0);
|
||||||
|
writeBuffer.reset();
|
||||||
|
// replace our placeholder:
|
||||||
|
blocks.set(idxSav, bytes2);
|
||||||
|
|
||||||
|
negativeDeltas[splitDim] = false;
|
||||||
|
int rightNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID+1, lastSplitValues, negativeDeltas, false);
|
||||||
|
|
||||||
|
negativeDeltas[splitDim] = savNegativeDelta;
|
||||||
|
|
||||||
|
// restore lastSplitValues to what caller originally passed us:
|
||||||
|
System.arraycopy(savSplitValue, 0, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
|
||||||
|
|
||||||
|
assert Arrays.equals(lastSplitValues, cmp);
|
||||||
|
|
||||||
|
return numBytes + numBytes2 + leftNumBytes + rightNumBytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID) {
|
||||||
|
int nodeIDIn = nodeID;
|
||||||
|
// TODO: can we do this cheaper, e.g. a closed form solution instead of while loop? Or
|
||||||
|
// change the recursion while packing the index to return this left-most leaf block FP
|
||||||
|
// from each recursion instead?
|
||||||
|
//
|
||||||
|
// Still, the overall cost here is minor: this method's cost is O(log(N)), and while writing
|
||||||
|
// we call it O(N) times (N = number of leaf blocks)
|
||||||
|
while (nodeID < leafBlockFPs.length) {
|
||||||
|
nodeID *= 2;
|
||||||
|
}
|
||||||
|
int leafID = nodeID - leafBlockFPs.length;
|
||||||
|
long result = leafBlockFPs[leafID];
|
||||||
|
if (result < 0) {
|
||||||
|
throw new AssertionError(result + " for leaf " + leafID);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
|
||||||
|
byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
|
||||||
|
writeIndex(out, leafBlockFPs.length, packedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
|
||||||
|
|
||||||
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
|
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
|
||||||
out.writeVInt(numDims);
|
out.writeVInt(numDims);
|
||||||
out.writeVInt(maxPointsInLeafNode);
|
out.writeVInt(maxPointsInLeafNode);
|
||||||
out.writeVInt(bytesPerDim);
|
out.writeVInt(bytesPerDim);
|
||||||
|
|
||||||
assert leafBlockFPs.length > 0;
|
assert numLeaves > 0;
|
||||||
out.writeVInt(leafBlockFPs.length);
|
out.writeVInt(numLeaves);
|
||||||
out.writeBytes(minPackedValue, 0, packedBytesLength);
|
out.writeBytes(minPackedValue, 0, packedBytesLength);
|
||||||
out.writeBytes(maxPackedValue, 0, packedBytesLength);
|
out.writeBytes(maxPackedValue, 0, packedBytesLength);
|
||||||
|
|
||||||
out.writeVLong(pointCount);
|
out.writeVLong(pointCount);
|
||||||
out.writeVInt(docsSeen.cardinality());
|
out.writeVInt(docsSeen.cardinality());
|
||||||
|
out.writeVInt(packedIndex.length);
|
||||||
// NOTE: splitPackedValues[0] is unused, because nodeID is 1-based:
|
out.writeBytes(packedIndex, 0, packedIndex.length);
|
||||||
if (numDims == 1) {
|
|
||||||
// write the index, skipping the byte used to store the split dim since it is always 0
|
|
||||||
for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) {
|
|
||||||
out.writeBytes(splitPackedValues, i, bytesPerDim);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
long lastFP = 0;
|
|
||||||
for (int i=0;i<leafBlockFPs.length;i++) {
|
|
||||||
long delta = leafBlockFPs[i]-lastFP;
|
|
||||||
out.writeVLong(delta);
|
|
||||||
lastFP = leafBlockFPs[i];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
|
private void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
|
||||||
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
|
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
|
||||||
out.writeVInt(count);
|
out.writeVInt(count);
|
||||||
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
||||||
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
|
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
|
||||||
if (prefixLenSum == packedBytesLength) {
|
if (prefixLenSum == packedBytesLength) {
|
||||||
// all values in this block are equal
|
// all values in this block are equal
|
||||||
@ -1109,7 +1316,7 @@ public class BKDWriter implements Closeable {
|
|||||||
return end - start;
|
return end - start;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
|
private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
|
||||||
for(int dim=0;dim<numDims;dim++) {
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
out.writeVInt(commonPrefixes[dim]);
|
out.writeVInt(commonPrefixes[dim]);
|
||||||
//System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
|
//System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
|
||||||
@ -1177,7 +1384,7 @@ public class BKDWriter implements Closeable {
|
|||||||
// TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
|
// TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
|
||||||
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
|
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
|
||||||
boolean result = reader.next();
|
boolean result = reader.next();
|
||||||
assert result;
|
assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
|
||||||
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
|
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
|
||||||
if (numDims > 1) {
|
if (numDims > 1) {
|
||||||
assert ordBitSet.get(reader.ord()) == false;
|
assert ordBitSet.get(reader.ord()) == false;
|
||||||
@ -1244,12 +1451,12 @@ public class BKDWriter implements Closeable {
|
|||||||
|
|
||||||
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
|
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
|
||||||
private void build(int nodeID, int leafNodeOffset,
|
private void build(int nodeID, int leafNodeOffset,
|
||||||
MutablePointValues reader, int from, int to,
|
MutablePointValues reader, int from, int to,
|
||||||
IndexOutput out,
|
IndexOutput out,
|
||||||
byte[] minPackedValue, byte[] maxPackedValue,
|
byte[] minPackedValue, byte[] maxPackedValue,
|
||||||
byte[] splitPackedValues,
|
byte[] splitPackedValues,
|
||||||
long[] leafBlockFPs,
|
long[] leafBlockFPs,
|
||||||
int[] spareDocIds) throws IOException {
|
int[] spareDocIds) throws IOException {
|
||||||
|
|
||||||
if (nodeID >= leafNodeOffset) {
|
if (nodeID >= leafNodeOffset) {
|
||||||
// leaf node
|
// leaf node
|
||||||
@ -1311,6 +1518,7 @@ public class BKDWriter implements Closeable {
|
|||||||
for (int i = from; i < to; ++i) {
|
for (int i = from; i < to; ++i) {
|
||||||
docIDs[i - from] = reader.getDocID(i);
|
docIDs[i - from] = reader.getDocID(i);
|
||||||
}
|
}
|
||||||
|
//System.out.println("writeLeafBlock pos=" + out.getFilePointer());
|
||||||
writeLeafBlockDocs(out, docIDs, 0, count);
|
writeLeafBlockDocs(out, docIDs, 0, count);
|
||||||
|
|
||||||
// Write the common prefixes:
|
// Write the common prefixes:
|
||||||
@ -1344,6 +1552,7 @@ public class BKDWriter implements Closeable {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
|
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
|
||||||
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
|
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
|
||||||
|
|
||||||
@ -1381,7 +1590,7 @@ public class BKDWriter implements Closeable {
|
|||||||
for(PathSlice slice : slices) {
|
for(PathSlice slice : slices) {
|
||||||
assert slice.count == slices[0].count;
|
assert slice.count == slices[0].count;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
|
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
|
||||||
// Special case for 1D, to cutover to heap once we recurse deeply enough:
|
// Special case for 1D, to cutover to heap once we recurse deeply enough:
|
||||||
slices[0] = switchToHeap(slices[0], toCloseHeroically);
|
slices[0] = switchToHeap(slices[0], toCloseHeroically);
|
||||||
|
@ -18,7 +18,10 @@ package org.apache.lucene.util.bkd;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
final class HeapPointReader extends PointReader {
|
/** Utility class to read buffered points from in-heap arrays.
|
||||||
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public final class HeapPointReader extends PointReader {
|
||||||
private int curRead;
|
private int curRead;
|
||||||
final List<byte[]> blocks;
|
final List<byte[]> blocks;
|
||||||
final int valuesPerBlock;
|
final int valuesPerBlock;
|
||||||
@ -30,7 +33,7 @@ final class HeapPointReader extends PointReader {
|
|||||||
final byte[] scratch;
|
final byte[] scratch;
|
||||||
final boolean singleValuePerDoc;
|
final boolean singleValuePerDoc;
|
||||||
|
|
||||||
HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
|
public HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
|
||||||
this.blocks = blocks;
|
this.blocks = blocks;
|
||||||
this.valuesPerBlock = valuesPerBlock;
|
this.valuesPerBlock = valuesPerBlock;
|
||||||
this.singleValuePerDoc = singleValuePerDoc;
|
this.singleValuePerDoc = singleValuePerDoc;
|
||||||
|
@ -24,18 +24,21 @@ import java.util.List;
|
|||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
final class HeapPointWriter implements PointWriter {
|
/** Utility class to write new points into in-heap arrays.
|
||||||
int[] docIDs;
|
*
|
||||||
long[] ordsLong;
|
* @lucene.internal */
|
||||||
int[] ords;
|
public final class HeapPointWriter implements PointWriter {
|
||||||
|
public int[] docIDs;
|
||||||
|
public long[] ordsLong;
|
||||||
|
public int[] ords;
|
||||||
private int nextWrite;
|
private int nextWrite;
|
||||||
private boolean closed;
|
private boolean closed;
|
||||||
final int maxSize;
|
final int maxSize;
|
||||||
final int valuesPerBlock;
|
public final int valuesPerBlock;
|
||||||
final int packedBytesLength;
|
final int packedBytesLength;
|
||||||
final boolean singleValuePerDoc;
|
final boolean singleValuePerDoc;
|
||||||
// NOTE: can't use ByteBlockPool because we need random-write access when sorting in heap
|
// NOTE: can't use ByteBlockPool because we need random-write access when sorting in heap
|
||||||
final List<byte[]> blocks = new ArrayList<>();
|
public final List<byte[]> blocks = new ArrayList<>();
|
||||||
|
|
||||||
public HeapPointWriter(int initSize, int maxSize, int packedBytesLength, boolean longOrds, boolean singleValuePerDoc) {
|
public HeapPointWriter(int initSize, int maxSize, int packedBytesLength, boolean longOrds, boolean singleValuePerDoc) {
|
||||||
docIDs = new int[initSize];
|
docIDs = new int[initSize];
|
||||||
@ -77,7 +80,7 @@ final class HeapPointWriter implements PointWriter {
|
|||||||
nextWrite = other.nextWrite;
|
nextWrite = other.nextWrite;
|
||||||
}
|
}
|
||||||
|
|
||||||
void readPackedValue(int index, byte[] bytes) {
|
public void readPackedValue(int index, byte[] bytes) {
|
||||||
assert bytes.length == packedBytesLength;
|
assert bytes.length == packedBytesLength;
|
||||||
int block = index / valuesPerBlock;
|
int block = index / valuesPerBlock;
|
||||||
int blockIndex = index % valuesPerBlock;
|
int blockIndex = index % valuesPerBlock;
|
||||||
@ -85,7 +88,7 @@ final class HeapPointWriter implements PointWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
|
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
|
||||||
void getPackedValueSlice(int index, BytesRef result) {
|
public void getPackedValueSlice(int index, BytesRef result) {
|
||||||
int block = index / valuesPerBlock;
|
int block = index / valuesPerBlock;
|
||||||
int blockIndex = index % valuesPerBlock;
|
int blockIndex = index % valuesPerBlock;
|
||||||
result.bytes = blocks.get(block);
|
result.bytes = blocks.get(block);
|
||||||
@ -138,7 +141,8 @@ final class HeapPointWriter implements PointWriter {
|
|||||||
@Override
|
@Override
|
||||||
public PointReader getReader(long start, long length) {
|
public PointReader getReader(long start, long length) {
|
||||||
assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length;
|
assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length;
|
||||||
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc);
|
assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite;
|
||||||
|
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, Math.toIntExact(start+length), singleValuePerDoc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -26,13 +26,16 @@ import org.apache.lucene.util.Selector;
|
|||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.packed.PackedInts;
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
final class MutablePointsReaderUtils {
|
/** Utility APIs for sorting and partitioning buffered points.
|
||||||
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public final class MutablePointsReaderUtils {
|
||||||
|
|
||||||
MutablePointsReaderUtils() {}
|
MutablePointsReaderUtils() {}
|
||||||
|
|
||||||
/** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
|
/** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
|
||||||
static void sort(int maxDoc, int packedBytesLength,
|
public static void sort(int maxDoc, int packedBytesLength,
|
||||||
MutablePointValues reader, int from, int to) {
|
MutablePointValues reader, int from, int to) {
|
||||||
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
|
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
|
||||||
new MSBRadixSorter(packedBytesLength + (bitsPerDocId + 7) / 8) {
|
new MSBRadixSorter(packedBytesLength + (bitsPerDocId + 7) / 8) {
|
||||||
|
|
||||||
@ -88,9 +91,9 @@ final class MutablePointsReaderUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Sort points on the given dimension. */
|
/** Sort points on the given dimension. */
|
||||||
static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
|
public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
|
||||||
MutablePointValues reader, int from, int to,
|
MutablePointValues reader, int from, int to,
|
||||||
BytesRef scratch1, BytesRef scratch2) {
|
BytesRef scratch1, BytesRef scratch2) {
|
||||||
|
|
||||||
// No need for a fancy radix sort here, this is called on the leaves only so
|
// No need for a fancy radix sort here, this is called on the leaves only so
|
||||||
// there are not many values to sort
|
// there are not many values to sort
|
||||||
@ -127,9 +130,9 @@ final class MutablePointsReaderUtils {
|
|||||||
/** Partition points around {@code mid}. All values on the left must be less
|
/** Partition points around {@code mid}. All values on the left must be less
|
||||||
* than or equal to it and all values on the right must be greater than or
|
* than or equal to it and all values on the right must be greater than or
|
||||||
* equal to it. */
|
* equal to it. */
|
||||||
static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
|
public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
|
||||||
MutablePointValues reader, int from, int to, int mid,
|
MutablePointValues reader, int from, int to, int mid,
|
||||||
BytesRef scratch1, BytesRef scratch2) {
|
BytesRef scratch1, BytesRef scratch2) {
|
||||||
final int offset = splitDim * bytesPerDim + commonPrefixLen;
|
final int offset = splitDim * bytesPerDim + commonPrefixLen;
|
||||||
final int cmpBytes = bytesPerDim - commonPrefixLen;
|
final int cmpBytes = bytesPerDim - commonPrefixLen;
|
||||||
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
|
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
|
||||||
|
@ -27,8 +27,10 @@ import org.apache.lucene.store.IndexInput;
|
|||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.LongBitSet;
|
import org.apache.lucene.util.LongBitSet;
|
||||||
|
|
||||||
/** Reads points from disk in a fixed-with format, previously written with {@link OfflinePointWriter}. */
|
/** Reads points from disk in a fixed-with format, previously written with {@link OfflinePointWriter}.
|
||||||
final class OfflinePointReader extends PointReader {
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public final class OfflinePointReader extends PointReader {
|
||||||
long countLeft;
|
long countLeft;
|
||||||
final IndexInput in;
|
final IndexInput in;
|
||||||
private final byte[] packedValue;
|
private final byte[] packedValue;
|
||||||
@ -43,7 +45,7 @@ final class OfflinePointReader extends PointReader {
|
|||||||
// File name we are reading
|
// File name we are reading
|
||||||
final String name;
|
final String name;
|
||||||
|
|
||||||
OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
|
public OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
|
||||||
boolean longOrds, boolean singleValuePerDoc) throws IOException {
|
boolean longOrds, boolean singleValuePerDoc) throws IOException {
|
||||||
this.singleValuePerDoc = singleValuePerDoc;
|
this.singleValuePerDoc = singleValuePerDoc;
|
||||||
int bytesPerDoc = packedBytesLength + Integer.BYTES;
|
int bytesPerDoc = packedBytesLength + Integer.BYTES;
|
||||||
|
@ -26,12 +26,14 @@ import org.apache.lucene.store.Directory;
|
|||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
|
||||||
/** Writes points to disk in a fixed-with format. */
|
/** Writes points to disk in a fixed-with format.
|
||||||
final class OfflinePointWriter implements PointWriter {
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public final class OfflinePointWriter implements PointWriter {
|
||||||
|
|
||||||
final Directory tempDir;
|
final Directory tempDir;
|
||||||
final IndexOutput out;
|
public final IndexOutput out;
|
||||||
final String name;
|
public final String name;
|
||||||
final int packedBytesLength;
|
final int packedBytesLength;
|
||||||
final boolean singleValuePerDoc;
|
final boolean singleValuePerDoc;
|
||||||
long count;
|
long count;
|
||||||
|
@ -24,20 +24,22 @@ import org.apache.lucene.util.LongBitSet;
|
|||||||
|
|
||||||
/** One pass iterator through all points previously written with a
|
/** One pass iterator through all points previously written with a
|
||||||
* {@link PointWriter}, abstracting away whether points a read
|
* {@link PointWriter}, abstracting away whether points a read
|
||||||
* from (offline) disk or simple arrays in heap. */
|
* from (offline) disk or simple arrays in heap.
|
||||||
abstract class PointReader implements Closeable {
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public abstract class PointReader implements Closeable {
|
||||||
|
|
||||||
/** Returns false once iteration is done, else true. */
|
/** Returns false once iteration is done, else true. */
|
||||||
abstract boolean next() throws IOException;
|
public abstract boolean next() throws IOException;
|
||||||
|
|
||||||
/** Returns the packed byte[] value */
|
/** Returns the packed byte[] value */
|
||||||
abstract byte[] packedValue();
|
public abstract byte[] packedValue();
|
||||||
|
|
||||||
/** Point ordinal */
|
/** Point ordinal */
|
||||||
abstract long ord();
|
public abstract long ord();
|
||||||
|
|
||||||
/** DocID for this point */
|
/** DocID for this point */
|
||||||
abstract int docID();
|
public abstract int docID();
|
||||||
|
|
||||||
/** Iterates through the next {@code count} ords, marking them in the provided {@code ordBitSet}. */
|
/** Iterates through the next {@code count} ords, marking them in the provided {@code ordBitSet}. */
|
||||||
public void markOrds(long count, LongBitSet ordBitSet) throws IOException {
|
public void markOrds(long count, LongBitSet ordBitSet) throws IOException {
|
||||||
|
@ -23,8 +23,10 @@ import java.util.List;
|
|||||||
|
|
||||||
/** Appends many points, and then at the end provides a {@link PointReader} to iterate
|
/** Appends many points, and then at the end provides a {@link PointReader} to iterate
|
||||||
* those points. This abstracts away whether we write to disk, or use simple arrays
|
* those points. This abstracts away whether we write to disk, or use simple arrays
|
||||||
* in heap. */
|
* in heap.
|
||||||
interface PointWriter extends Closeable {
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
public interface PointWriter extends Closeable {
|
||||||
/** Add a new point */
|
/** Add a new point */
|
||||||
void append(byte[] packedValue, long ord, int docID) throws IOException;
|
void append(byte[] packedValue, long ord, int docID) throws IOException;
|
||||||
|
|
||||||
|
@ -621,6 +621,9 @@ public class TestPointQueries extends LuceneTestCase {
|
|||||||
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
|
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
|
||||||
|
|
||||||
int sameValuePct = random().nextInt(100);
|
int sameValuePct = random().nextInt(100);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: sameValuePct=" + sameValuePct);
|
||||||
|
}
|
||||||
|
|
||||||
byte[][][] docValues = new byte[numValues][][];
|
byte[][][] docValues = new byte[numValues][][];
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.util.bkd;
|
package org.apache.lucene.util.bkd;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.CheckIndex;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
@ -64,7 +65,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
|||||||
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
|
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
|
||||||
in.seek(indexFP);
|
in.seek(indexFP);
|
||||||
BKDReader r = new BKDReader(in);
|
BKDReader r = new BKDReader(in);
|
||||||
r.verify(numDocs);
|
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
|
||||||
|
r.intersect(visitor);
|
||||||
|
assertEquals(r.size(), visitor.getPointCountSeen());
|
||||||
|
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
|
||||||
in.close();
|
in.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
@ -101,7 +105,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
|||||||
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
|
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
|
||||||
in.seek(indexFP);
|
in.seek(indexFP);
|
||||||
BKDReader r = new BKDReader(in);
|
BKDReader r = new BKDReader(in);
|
||||||
r.verify(numDocs);
|
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
|
||||||
|
r.intersect(visitor);
|
||||||
|
assertEquals(r.size(), visitor.getPointCountSeen());
|
||||||
|
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
|
||||||
in.close();
|
in.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
@ -28,6 +28,7 @@ import org.apache.lucene.index.CorruptIndexException;
|
|||||||
import org.apache.lucene.index.MergeState;
|
import org.apache.lucene.index.MergeState;
|
||||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||||
import org.apache.lucene.index.PointValues.Relation;
|
import org.apache.lucene.index.PointValues.Relation;
|
||||||
|
import org.apache.lucene.index.PointValues;
|
||||||
import org.apache.lucene.store.CorruptingIndexOutput;
|
import org.apache.lucene.store.CorruptingIndexOutput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FilterDirectory;
|
import org.apache.lucene.store.FilterDirectory;
|
||||||
@ -1010,4 +1011,57 @@ public class TestBKD extends LuceneTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Claims 16 bytes per dim, but only use the bottom N 1-3 bytes; this would happen e.g. if a user indexes what are actually just short
|
||||||
|
// values as a LongPoint:
|
||||||
|
public void testWastedLeadingBytes() throws Exception {
|
||||||
|
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
|
||||||
|
int bytesPerDim = PointValues.MAX_NUM_BYTES;
|
||||||
|
int bytesUsed = TestUtil.nextInt(random(), 1, 3);
|
||||||
|
|
||||||
|
Directory dir = newFSDirectory(createTempDir());
|
||||||
|
int numDocs = 100000;
|
||||||
|
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", numDims, bytesPerDim, 32, 1f, numDocs, true);
|
||||||
|
byte[] tmp = new byte[bytesUsed];
|
||||||
|
byte[] buffer = new byte[numDims * bytesPerDim];
|
||||||
|
for(int i=0;i<numDocs;i++) {
|
||||||
|
for(int dim=0;dim<numDims;dim++) {
|
||||||
|
random().nextBytes(tmp);
|
||||||
|
System.arraycopy(tmp, 0, buffer, dim*bytesPerDim+(bytesPerDim-bytesUsed), tmp.length);
|
||||||
|
}
|
||||||
|
w.add(buffer, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
|
||||||
|
long fp = w.finish(out);
|
||||||
|
out.close();
|
||||||
|
|
||||||
|
IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
|
||||||
|
in.seek(fp);
|
||||||
|
BKDReader r = new BKDReader(in);
|
||||||
|
int[] count = new int[1];
|
||||||
|
r.intersect(new IntersectVisitor() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID) {
|
||||||
|
count[0]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) {
|
||||||
|
visit(docID);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation compare(byte[] minPacked, byte[] maxPacked) {
|
||||||
|
if (random().nextInt(7) == 1) {
|
||||||
|
return Relation.CELL_CROSSES_QUERY;
|
||||||
|
} else {
|
||||||
|
return Relation.CELL_INSIDE_QUERY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assertEquals(numDocs, count[0]);
|
||||||
|
in.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -228,7 +228,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||||||
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
|
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
|
||||||
lastOutput = value;
|
lastOutput = value;
|
||||||
pairs.add(new FSTTester.InputOutput<>(terms[idx],
|
pairs.add(new FSTTester.InputOutput<>(terms[idx],
|
||||||
outputs.newPair((long) idx, value)));
|
outputs.newPair((long) idx, value)));
|
||||||
}
|
}
|
||||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,10 @@ import org.apache.lucene.geo.Rectangle;
|
|||||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||||
import org.apache.lucene.index.PointValues.Relation;
|
import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.SloppyMath;
|
import org.apache.lucene.util.SloppyMath;
|
||||||
|
import org.apache.lucene.util.bkd.BKDReader.IndexTree;
|
||||||
|
import org.apache.lucene.util.bkd.BKDReader.IntersectState;
|
||||||
import org.apache.lucene.util.bkd.BKDReader;
|
import org.apache.lucene.util.bkd.BKDReader;
|
||||||
|
|
||||||
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
|
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
|
||||||
@ -41,16 +44,16 @@ class NearestNeighbor {
|
|||||||
|
|
||||||
static class Cell implements Comparable<Cell> {
|
static class Cell implements Comparable<Cell> {
|
||||||
final int readerIndex;
|
final int readerIndex;
|
||||||
final int nodeID;
|
|
||||||
final byte[] minPacked;
|
final byte[] minPacked;
|
||||||
final byte[] maxPacked;
|
final byte[] maxPacked;
|
||||||
|
final IndexTree index;
|
||||||
|
|
||||||
/** The closest possible distance of all points in this cell */
|
/** The closest possible distance of all points in this cell */
|
||||||
final double distanceMeters;
|
final double distanceMeters;
|
||||||
|
|
||||||
public Cell(int readerIndex, int nodeID, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
|
public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
|
||||||
|
this.index = index;
|
||||||
this.readerIndex = readerIndex;
|
this.readerIndex = readerIndex;
|
||||||
this.nodeID = nodeID;
|
|
||||||
this.minPacked = minPacked.clone();
|
this.minPacked = minPacked.clone();
|
||||||
this.maxPacked = maxPacked.clone();
|
this.maxPacked = maxPacked.clone();
|
||||||
this.distanceMeters = distanceMeters;
|
this.distanceMeters = distanceMeters;
|
||||||
@ -66,7 +69,7 @@ class NearestNeighbor {
|
|||||||
double minLon = decodeLongitude(minPacked, Integer.BYTES);
|
double minLon = decodeLongitude(minPacked, Integer.BYTES);
|
||||||
double maxLat = decodeLatitude(maxPacked, 0);
|
double maxLat = decodeLatitude(maxPacked, 0);
|
||||||
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
|
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
|
||||||
return "Cell(readerIndex=" + readerIndex + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
|
return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,13 +222,21 @@ class NearestNeighbor {
|
|||||||
List<BKDReader.IntersectState> states = new ArrayList<>();
|
List<BKDReader.IntersectState> states = new ArrayList<>();
|
||||||
|
|
||||||
// Add root cell for each reader into the queue:
|
// Add root cell for each reader into the queue:
|
||||||
|
int bytesPerDim = -1;
|
||||||
|
|
||||||
for(int i=0;i<readers.size();i++) {
|
for(int i=0;i<readers.size();i++) {
|
||||||
BKDReader reader = readers.get(i);
|
BKDReader reader = readers.get(i);
|
||||||
|
if (bytesPerDim == -1) {
|
||||||
|
bytesPerDim = reader.getBytesPerDimension();
|
||||||
|
} else if (bytesPerDim != reader.getBytesPerDimension()) {
|
||||||
|
throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers");
|
||||||
|
}
|
||||||
byte[] minPackedValue = reader.getMinPackedValue();
|
byte[] minPackedValue = reader.getMinPackedValue();
|
||||||
byte[] maxPackedValue = reader.getMaxPackedValue();
|
byte[] maxPackedValue = reader.getMaxPackedValue();
|
||||||
states.add(reader.getIntersectState(visitor));
|
IntersectState state = reader.getIntersectState(visitor);
|
||||||
|
states.add(state);
|
||||||
|
|
||||||
cellQueue.offer(new Cell(i, 1, reader.getMinPackedValue(), reader.getMaxPackedValue(),
|
cellQueue.offer(new Cell(state.index, i, reader.getMinPackedValue(), reader.getMaxPackedValue(),
|
||||||
approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
|
approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -236,12 +247,12 @@ class NearestNeighbor {
|
|||||||
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
|
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
|
||||||
BKDReader reader = readers.get(cell.readerIndex);
|
BKDReader reader = readers.get(cell.readerIndex);
|
||||||
|
|
||||||
if (reader.isLeafNode(cell.nodeID)) {
|
if (cell.index.isLeafNode()) {
|
||||||
//System.out.println(" leaf");
|
//System.out.println(" leaf");
|
||||||
// Leaf block: visit all points and possibly collect them:
|
// Leaf block: visit all points and possibly collect them:
|
||||||
visitor.curDocBase = docBases.get(cell.readerIndex);
|
visitor.curDocBase = docBases.get(cell.readerIndex);
|
||||||
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
|
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
|
||||||
reader.visitLeafBlockValues(cell.nodeID, states.get(cell.readerIndex));
|
reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
|
||||||
//System.out.println(" now " + hitQueue.size() + " hits");
|
//System.out.println(" now " + hitQueue.size() + " hits");
|
||||||
} else {
|
} else {
|
||||||
//System.out.println(" non-leaf");
|
//System.out.println(" non-leaf");
|
||||||
@ -257,14 +268,23 @@ class NearestNeighbor {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
|
||||||
|
int splitDim = cell.index.getSplitDim();
|
||||||
|
|
||||||
|
// we must clone the index so that we we can recurse left and right "concurrently":
|
||||||
|
IndexTree newIndex = cell.index.clone();
|
||||||
byte[] splitPackedValue = cell.maxPacked.clone();
|
byte[] splitPackedValue = cell.maxPacked.clone();
|
||||||
reader.copySplitValue(cell.nodeID, splitPackedValue);
|
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID, cell.minPacked, splitPackedValue,
|
|
||||||
|
cell.index.pushLeft();
|
||||||
|
cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue,
|
||||||
approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
|
approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
|
||||||
|
|
||||||
splitPackedValue = cell.minPacked.clone();
|
splitPackedValue = cell.minPacked.clone();
|
||||||
reader.copySplitValue(cell.nodeID, splitPackedValue);
|
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
|
||||||
cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID+1, splitPackedValue, cell.maxPacked,
|
|
||||||
|
newIndex.pushRight();
|
||||||
|
cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked,
|
||||||
approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
|
approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user