mirror of https://github.com/apache/lucene.git
LUCENE-9820: PointTree#size() should handle the case of balanced tree in pre-8.6 indexes (#462)
Properly handle the case where trees are fully balanced, which applies when the number of dimensions is > 1
This commit is contained in:
parent
8710252116
commit
800f002e44
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
|
||||||
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
|
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.MutablePointTree;
|
import org.apache.lucene.codecs.MutablePointTree;
|
||||||
|
@ -36,8 +37,6 @@ import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.bkd.BKDConfig;
|
|
||||||
import org.apache.lucene.util.bkd.BKDWriter;
|
|
||||||
|
|
||||||
/** Writes dimensional values */
|
/** Writes dimensional values */
|
||||||
public class Lucene60PointsWriter extends PointsWriter {
|
public class Lucene60PointsWriter extends PointsWriter {
|
||||||
|
@ -91,8 +90,8 @@ public class Lucene60PointsWriter extends PointsWriter {
|
||||||
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
|
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
|
||||||
this(
|
this(
|
||||||
writeState,
|
writeState,
|
||||||
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
|
BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
|
||||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
|
BKDWriter60.DEFAULT_MAX_MB_SORT_IN_HEAP);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -100,28 +99,22 @@ public class Lucene60PointsWriter extends PointsWriter {
|
||||||
|
|
||||||
PointValues.PointTree values = reader.getValues(fieldInfo.name).getPointTree();
|
PointValues.PointTree values = reader.getValues(fieldInfo.name).getPointTree();
|
||||||
|
|
||||||
BKDConfig config =
|
try (BKDWriter60 writer =
|
||||||
new BKDConfig(
|
new BKDWriter60(
|
||||||
fieldInfo.getPointDimensionCount(),
|
|
||||||
fieldInfo.getPointIndexDimensionCount(),
|
|
||||||
fieldInfo.getPointNumBytes(),
|
|
||||||
maxPointsInLeafNode);
|
|
||||||
|
|
||||||
try (BKDWriter writer =
|
|
||||||
new BKDWriter(
|
|
||||||
writeState.segmentInfo.maxDoc(),
|
writeState.segmentInfo.maxDoc(),
|
||||||
writeState.directory,
|
writeState.directory,
|
||||||
writeState.segmentInfo.name,
|
writeState.segmentInfo.name,
|
||||||
config,
|
fieldInfo.getPointDimensionCount(),
|
||||||
|
fieldInfo.getPointIndexDimensionCount(),
|
||||||
|
fieldInfo.getPointNumBytes(),
|
||||||
|
maxPointsInLeafNode,
|
||||||
maxMBSortInHeap,
|
maxMBSortInHeap,
|
||||||
values.size())) {
|
values.size())) {
|
||||||
|
|
||||||
if (values instanceof MutablePointTree) {
|
if (values instanceof MutablePointTree) {
|
||||||
Runnable finalizer =
|
final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointTree) values);
|
||||||
writer.writeField(dataOut, dataOut, dataOut, fieldInfo.name, (MutablePointTree) values);
|
if (fp != -1) {
|
||||||
if (finalizer != null) {
|
indexFPs.put(fieldInfo.name, fp);
|
||||||
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
|
|
||||||
finalizer.run();
|
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -145,10 +138,8 @@ public class Lucene60PointsWriter extends PointsWriter {
|
||||||
});
|
});
|
||||||
|
|
||||||
// We could have 0 points on merge since all docs with dimensional fields may be deleted:
|
// We could have 0 points on merge since all docs with dimensional fields may be deleted:
|
||||||
Runnable finalizer = writer.finish(dataOut, dataOut, dataOut);
|
if (writer.getPointCount() > 0) {
|
||||||
if (finalizer != null) {
|
indexFPs.put(fieldInfo.name, writer.finish(dataOut));
|
||||||
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
|
|
||||||
finalizer.run();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -193,26 +184,22 @@ public class Lucene60PointsWriter extends PointsWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
BKDConfig config =
|
|
||||||
new BKDConfig(
|
|
||||||
fieldInfo.getPointDimensionCount(),
|
|
||||||
fieldInfo.getPointIndexDimensionCount(),
|
|
||||||
fieldInfo.getPointNumBytes(),
|
|
||||||
maxPointsInLeafNode);
|
|
||||||
|
|
||||||
// System.out.println("MERGE: field=" + fieldInfo.name);
|
// System.out.println("MERGE: field=" + fieldInfo.name);
|
||||||
// Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
|
// Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
|
||||||
// already sorted incoming segments, instead of trying to sort all points again as if
|
// already sorted incoming segments, instead of trying to sort all points again as if
|
||||||
// we were simply reindexing them:
|
// we were simply reindexing them:
|
||||||
try (BKDWriter writer =
|
try (BKDWriter60 writer =
|
||||||
new BKDWriter(
|
new BKDWriter60(
|
||||||
writeState.segmentInfo.maxDoc(),
|
writeState.segmentInfo.maxDoc(),
|
||||||
writeState.directory,
|
writeState.directory,
|
||||||
writeState.segmentInfo.name,
|
writeState.segmentInfo.name,
|
||||||
config,
|
fieldInfo.getPointDimensionCount(),
|
||||||
|
fieldInfo.getPointIndexDimensionCount(),
|
||||||
|
fieldInfo.getPointNumBytes(),
|
||||||
|
maxPointsInLeafNode,
|
||||||
maxMBSortInHeap,
|
maxMBSortInHeap,
|
||||||
totMaxSize)) {
|
totMaxSize)) {
|
||||||
List<PointValues> pointValues = new ArrayList<>();
|
List<PointValues> bkdReaders = new ArrayList<>();
|
||||||
List<MergeState.DocMap> docMaps = new ArrayList<>();
|
List<MergeState.DocMap> docMaps = new ArrayList<>();
|
||||||
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
|
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
|
||||||
PointsReader reader = mergeState.pointsReaders[i];
|
PointsReader reader = mergeState.pointsReaders[i];
|
||||||
|
@ -231,19 +218,18 @@ public class Lucene60PointsWriter extends PointsWriter {
|
||||||
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
|
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
|
||||||
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
|
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
|
||||||
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
|
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
|
||||||
PointValues aPointValues = reader60.readers.get(readerFieldInfo.number);
|
PointValues bkdReader = reader60.readers.get(readerFieldInfo.number);
|
||||||
if (aPointValues != null) {
|
if (bkdReader != null) {
|
||||||
pointValues.add(aPointValues);
|
bkdReaders.add(bkdReader);
|
||||||
docMaps.add(mergeState.docMaps[i]);
|
docMaps.add(mergeState.docMaps[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Runnable finalizer = writer.merge(dataOut, dataOut, dataOut, docMaps, pointValues);
|
long fp = writer.merge(dataOut, docMaps, bkdReaders);
|
||||||
if (finalizer != null) {
|
if (fp != -1) {
|
||||||
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
|
indexFPs.put(fieldInfo.name, fp);
|
||||||
finalizer.run();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene60;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
|
||||||
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWCodec;
|
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWCodec;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.document.BinaryPoint;
|
import org.apache.lucene.document.BinaryPoint;
|
||||||
|
@ -35,7 +36,6 @@ import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.LuceneTestCase.Nightly;
|
import org.apache.lucene.util.LuceneTestCase.Nightly;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
import org.apache.lucene.util.bkd.BKDConfig;
|
|
||||||
|
|
||||||
/** Tests Lucene60PointsFormat */
|
/** Tests Lucene60PointsFormat */
|
||||||
@Nightly // N-2 formats are only tested on nightly runs
|
@Nightly // N-2 formats are only tested on nightly runs
|
||||||
|
@ -45,7 +45,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
||||||
|
|
||||||
public TestLucene60PointsFormat() {
|
public TestLucene60PointsFormat() {
|
||||||
codec = new Lucene84RWCodec();
|
codec = new Lucene84RWCodec();
|
||||||
maxPointsInLeafNode = BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
|
maxPointsInLeafNode = BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -280,16 +280,23 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
||||||
};
|
};
|
||||||
|
|
||||||
final long pointCount = points.estimatePointCount(onePointMatchVisitor);
|
final long pointCount = points.estimatePointCount(onePointMatchVisitor);
|
||||||
final long lastNodePointCount = totalValues % maxPointsInLeafNode;
|
// With >1 dims, the tree is balanced
|
||||||
|
long actualMaxPointsInLeafNode = points.size();
|
||||||
|
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
|
||||||
|
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
|
||||||
|
}
|
||||||
|
final long countPerFullLeaf = (actualMaxPointsInLeafNode + 1) / 2;
|
||||||
|
final long countPerNotFullLeaf = (actualMaxPointsInLeafNode) / 2;
|
||||||
assertTrue(
|
assertTrue(
|
||||||
"" + pointCount,
|
pointCount + " vs " + actualMaxPointsInLeafNode,
|
||||||
pointCount == (maxPointsInLeafNode + 1) / 2 // common case
|
// common case, point in one leaf.
|
||||||
|| pointCount == (lastNodePointCount + 1) / 2 // not fully populated leaf
|
pointCount >= countPerNotFullLeaf && pointCount <= countPerFullLeaf
|
||||||
|| pointCount == 2 * ((maxPointsInLeafNode + 1) / 2) // if the point is a split value
|
||
|
||||||
|| pointCount == ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2)
|
// one dimension is a split value
|
||||||
// in extreme cases, a point can be shared by 4 leaves
|
pointCount >= 2 * countPerNotFullLeaf && pointCount <= 2 * countPerFullLeaf
|
||||||
|| pointCount == 4 * ((maxPointsInLeafNode + 1) / 2)
|
||
|
||||||
|| pointCount == 3 * ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2));
|
// both dimensions are split values
|
||||||
|
pointCount >= 4 * countPerNotFullLeaf && pointCount <= 4 * countPerFullLeaf);
|
||||||
|
|
||||||
final long docCount = points.estimateDocCount(onePointMatchVisitor);
|
final long docCount = points.estimateDocCount(onePointMatchVisitor);
|
||||||
if (multiValues) {
|
if (multiValues) {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,85 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.backward_codecs.lucene60.bkd;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
|
||||||
|
/**
 * Static helper that serializes a slice of doc IDs to a {@link DataOutput}.
 *
 * <p>The output starts with one marker byte that identifies the encoding used for the slice:
 *
 * <ul>
 *   <li>{@code 0}: the doc IDs are in non-decreasing order and are written as vInt deltas;
 *   <li>{@code 24}: every doc ID fits in 24 bits and is written using 3 bytes per value;
 *   <li>{@code 32}: every doc ID is written as a plain int.
 * </ul>
 */
class DocIdsWriter {
|
||||||
|
|
||||||
|
// Not instantiable: the class only exposes a static method.
private DocIdsWriter() {}
|
||||||
|
|
||||||
|
/**
 * Writes {@code count} doc IDs taken from {@code docIds}, beginning at index {@code start}.
 *
 * @param docIds array holding the doc IDs
 * @param start index of the first doc ID to write
 * @param count number of doc IDs to write
 * @param out destination of the encoded bytes
 * @throws IOException if writing to {@code out} fails
 */
static void writeDocIds(int[] docIds, int start, int count, DataOutput out) throws IOException {
|
||||||
|
// docs can be sorted either when all docs in a block have the same value
|
||||||
|
// or when a segment is sorted
|
||||||
|
boolean sorted = true;
|
||||||
|
// Equal neighbours are allowed: only a strictly decreasing pair clears the flag.
for (int i = 1; i < count; ++i) {
|
||||||
|
if (docIds[start + i - 1] > docIds[start + i]) {
|
||||||
|
sorted = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (sorted) {
|
||||||
|
// Marker 0: delta encoding, the first delta is relative to 0.
out.writeByte((byte) 0);
|
||||||
|
int previous = 0;
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
int doc = docIds[start + i];
|
||||||
|
out.writeVInt(doc - previous);
|
||||||
|
previous = doc;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// OR all values together (as unsigned) so a single comparison tells whether any of them
// has a bit set above the low 24 bits.
long max = 0;
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
max |= Integer.toUnsignedLong(docIds[start + i]);
|
||||||
|
}
|
||||||
|
if (max <= 0xffffff) {
|
||||||
|
// Marker 24: three bytes per doc ID.
out.writeByte((byte) 24);
|
||||||
|
// write them the same way we are reading them.
|
||||||
|
// i is declared outside the loop because the tail loop below resumes from it.
int i;
|
||||||
|
// Pack 8 doc IDs (8 * 24 = 192 bits) into three longs per iteration.
for (i = 0; i < count - 7; i += 8) {
|
||||||
|
int doc1 = docIds[start + i];
|
||||||
|
int doc2 = docIds[start + i + 1];
|
||||||
|
int doc3 = docIds[start + i + 2];
|
||||||
|
int doc4 = docIds[start + i + 3];
|
||||||
|
int doc5 = docIds[start + i + 4];
|
||||||
|
int doc6 = docIds[start + i + 5];
|
||||||
|
int doc7 = docIds[start + i + 6];
|
||||||
|
int doc8 = docIds[start + i + 7];
|
||||||
|
// l1 = doc1 (24 bits) | doc2 (24 bits) | upper 16 bits of doc3
long l1 = (doc1 & 0xffffffL) << 40 | (doc2 & 0xffffffL) << 16 | ((doc3 >>> 8) & 0xffffL);
|
||||||
|
// l2 = lower 8 bits of doc3 | doc4 (24 bits) | doc5 (24 bits) | upper 8 bits of doc6
long l2 =
|
||||||
|
(doc3 & 0xffL) << 56
|
||||||
|
| (doc4 & 0xffffffL) << 32
|
||||||
|
| (doc5 & 0xffffffL) << 8
|
||||||
|
| ((doc6 >> 16) & 0xffL);
|
||||||
|
// l3 = lower 16 bits of doc6 | doc7 (24 bits) | doc8 (24 bits)
long l3 = (doc6 & 0xffffL) << 48 | (doc7 & 0xffffffL) << 24 | (doc8 & 0xffffffL);
|
||||||
|
out.writeLong(l1);
|
||||||
|
out.writeLong(l2);
|
||||||
|
out.writeLong(l3);
|
||||||
|
}
|
||||||
|
// Tail: the remaining (fewer than 8) doc IDs are written one by one as a short holding
// bits 8-23 followed by a byte holding bits 0-7.
for (; i < count; ++i) {
|
||||||
|
out.writeShort((short) (docIds[start + i] >>> 8));
|
||||||
|
out.writeByte((byte) docIds[start + i]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Marker 32: at least one value needs more than 24 bits, write full ints.
out.writeByte((byte) 32);
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
out.writeInt(docIds[start + i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -87,7 +87,6 @@ final class SimpleTextBKDReader extends PointValues {
|
||||||
int nodeID;
|
int nodeID;
|
||||||
int level;
|
int level;
|
||||||
final int rootNode;
|
final int rootNode;
|
||||||
final int lastLeafNodeCount;
|
|
||||||
// holds the min / max value of the current node.
|
// holds the min / max value of the current node.
|
||||||
private final byte[] minPackedValue, maxPackedValue;
|
private final byte[] minPackedValue, maxPackedValue;
|
||||||
// holds the previous value of the split dimension
|
// holds the previous value of the split dimension
|
||||||
|
@ -107,9 +106,6 @@ final class SimpleTextBKDReader extends PointValues {
|
||||||
int treeDepth = getTreeDepth(leafNodeOffset);
|
int treeDepth = getTreeDepth(leafNodeOffset);
|
||||||
splitDimValueStack = new byte[treeDepth + 1][];
|
splitDimValueStack = new byte[treeDepth + 1][];
|
||||||
splitDims = new int[treeDepth + 1];
|
splitDims = new int[treeDepth + 1];
|
||||||
int lastLeafNodeCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
|
|
||||||
this.lastLeafNodeCount =
|
|
||||||
lastLeafNodeCount == 0 ? config.maxPointsInLeafNode : lastLeafNodeCount;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getTreeDepth(int numLeaves) {
|
private int getTreeDepth(int numLeaves) {
|
||||||
|
@ -285,9 +281,39 @@ final class SimpleTextBKDReader extends PointValues {
|
||||||
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
|
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
|
||||||
}
|
}
|
||||||
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
|
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
|
||||||
return rightMostLeafNode == (1 << getTreeDepth(leafNodeOffset) - 1) - 1
|
return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
|
||||||
? (long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodeCount
|
}
|
||||||
: (long) numLeaves * config.maxPointsInLeafNode;
|
|
||||||
|
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
|
||||||
|
// number of points that need to be distributed between leaves, one per leaf
|
||||||
|
final int extraPoints =
|
||||||
|
Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
|
||||||
|
assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
|
||||||
|
// offset where we stop adding one point to the leaves
|
||||||
|
final int nodeOffset = leafNodeOffset - extraPoints;
|
||||||
|
long count = 0;
|
||||||
|
for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
|
||||||
|
// offsetPosition provides which extra point will be added to this node
|
||||||
|
if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
|
||||||
|
count += config.maxPointsInLeafNode;
|
||||||
|
} else {
|
||||||
|
count += config.maxPointsInLeafNode - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Recursively locates {@code node} inside the half-open leaf range {@code [minNode, maxNode)} and
 * returns the position accumulated along the way.
 *
 * <p>At each step the range is split at a midpoint that is rounded up. Descending into the left
 * part keeps {@code position} unchanged; descending into the right part adds {@code 1 << level}.
 * The recursion stops once the range contains a single leaf.
 *
 * @param minNode inclusive lower bound of the current range
 * @param maxNode exclusive upper bound of the current range
 * @param node leaf index to locate, relative to the first leaf
 * @param position position accumulated so far
 * @param level depth of the current step, starting at 0
 * @return the accumulated position of {@code node}
 */
private int balanceTreeNodePosition(
|
||||||
|
int minNode, int maxNode, int node, int position, int level) {
|
||||||
|
// A range of width one holds exactly the leaf being searched for.
if (maxNode - minNode == 1) {
|
||||||
|
return position;
|
||||||
|
}
|
||||||
|
// Midpoint rounded up, computed with an unsigned shift.
final int mid = (minNode + maxNode + 1) >>> 1;
|
||||||
|
if (mid > node) {
|
||||||
|
// node lies in the left part [minNode, mid)
return balanceTreeNodePosition(minNode, mid, node, position, level + 1);
|
||||||
|
} else {
|
||||||
|
// node lies in the right part [mid, maxNode): record it by setting bit `level`
return balanceTreeNodePosition(mid, maxNode, node, position + (1 << level), level + 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getNumLeavesSlow(int node) {
|
private int getNumLeavesSlow(int node) {
|
||||||
|
|
|
@ -349,7 +349,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
new int[config.maxPointsInLeafNode]);
|
new int[config.maxPointsInLeafNode]);
|
||||||
|
|
||||||
long indexFP = out.getFilePointer();
|
long indexFP = out.getFilePointer();
|
||||||
writeIndex(out, leafBlockFPs, splitPackedValues);
|
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
|
||||||
return indexFP;
|
return indexFP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -478,7 +478,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
for (int i = 0; i < leafBlockFPs.size(); i++) {
|
for (int i = 0; i < leafBlockFPs.size(); i++) {
|
||||||
arr[i] = leafBlockFPs.get(i);
|
arr[i] = leafBlockFPs.get(i);
|
||||||
}
|
}
|
||||||
writeIndex(out, arr, index);
|
writeIndex(out, arr, index, config.maxPointsInLeafNode);
|
||||||
return indexFP;
|
return indexFP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -714,16 +714,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System.out.println("Total nodes: " + innerNodeCount);
|
|
||||||
|
|
||||||
// Write index:
|
// Write index:
|
||||||
long indexFP = out.getFilePointer();
|
long indexFP = out.getFilePointer();
|
||||||
writeIndex(out, leafBlockFPs, splitPackedValues);
|
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
|
||||||
return indexFP;
|
return indexFP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Subclass can change how it writes the index. */
|
/** Subclass can change how it writes the index. */
|
||||||
private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues)
|
private void writeIndex(
|
||||||
|
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
write(out, NUM_DATA_DIMS);
|
write(out, NUM_DATA_DIMS);
|
||||||
writeInt(out, config.numDims);
|
writeInt(out, config.numDims);
|
||||||
|
@ -738,7 +737,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
newline(out);
|
newline(out);
|
||||||
|
|
||||||
write(out, MAX_LEAF_POINTS);
|
write(out, MAX_LEAF_POINTS);
|
||||||
writeInt(out, config.maxPointsInLeafNode);
|
writeInt(out, maxPointsInLeafNode);
|
||||||
newline(out);
|
newline(out);
|
||||||
|
|
||||||
write(out, INDEX_COUNT);
|
write(out, INDEX_COUNT);
|
||||||
|
|
|
@ -154,6 +154,8 @@ public class BKDReader extends PointValues {
|
||||||
private final int leafNodeOffset;
|
private final int leafNodeOffset;
|
||||||
// version of the index
|
// version of the index
|
||||||
private final int version;
|
private final int version;
|
||||||
|
// total number of points
|
||||||
|
final long pointCount;
|
||||||
// last node might not be fully populated
|
// last node might not be fully populated
|
||||||
private final int lastLeafNodePointCount;
|
private final int lastLeafNodePointCount;
|
||||||
// right most leaf node ID
|
// right most leaf node ID
|
||||||
|
@ -181,7 +183,7 @@ public class BKDReader extends PointValues {
|
||||||
config,
|
config,
|
||||||
numLeaves,
|
numLeaves,
|
||||||
version,
|
version,
|
||||||
Math.toIntExact(pointCount % config.maxPointsInLeafNode),
|
pointCount,
|
||||||
1,
|
1,
|
||||||
1,
|
1,
|
||||||
minPackedValue,
|
minPackedValue,
|
||||||
|
@ -201,7 +203,7 @@ public class BKDReader extends PointValues {
|
||||||
BKDConfig config,
|
BKDConfig config,
|
||||||
int numLeaves,
|
int numLeaves,
|
||||||
int version,
|
int version,
|
||||||
int lastLeafNodePointCount,
|
long pointCount,
|
||||||
int nodeID,
|
int nodeID,
|
||||||
int level,
|
int level,
|
||||||
byte[] minPackedValue,
|
byte[] minPackedValue,
|
||||||
|
@ -231,7 +233,9 @@ public class BKDReader extends PointValues {
|
||||||
splitDimsPos = new int[treeDepth];
|
splitDimsPos = new int[treeDepth];
|
||||||
negativeDeltas = new boolean[config.numIndexDims * treeDepth];
|
negativeDeltas = new boolean[config.numIndexDims * treeDepth];
|
||||||
// information about the unbalance of the tree so we can report the exact size below a node
|
// information about the unbalance of the tree so we can report the exact size below a node
|
||||||
|
this.pointCount = pointCount;
|
||||||
rightMostLeafNode = (1 << treeDepth - 1) - 1;
|
rightMostLeafNode = (1 << treeDepth - 1) - 1;
|
||||||
|
int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
|
||||||
this.lastLeafNodePointCount =
|
this.lastLeafNodePointCount =
|
||||||
lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode : lastLeafNodePointCount;
|
lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode : lastLeafNodePointCount;
|
||||||
// scratch objects, reused between clones so NN search are not creating those objects
|
// scratch objects, reused between clones so NN search are not creating those objects
|
||||||
|
@ -252,7 +256,7 @@ public class BKDReader extends PointValues {
|
||||||
config,
|
config,
|
||||||
leafNodeOffset,
|
leafNodeOffset,
|
||||||
version,
|
version,
|
||||||
lastLeafNodePointCount,
|
pointCount,
|
||||||
nodeID,
|
nodeID,
|
||||||
level,
|
level,
|
||||||
minPackedValue,
|
minPackedValue,
|
||||||
|
@ -437,11 +441,48 @@ public class BKDReader extends PointValues {
|
||||||
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
|
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
|
||||||
}
|
}
|
||||||
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
|
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
|
||||||
|
if (version < BKDWriter.VERSION_META_FILE && config.numDims > 1) {
|
||||||
|
// before lucene 8.6, high dimensional trees were constructed as fully balanced trees.
|
||||||
|
return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
|
||||||
|
}
|
||||||
|
// size for an unbalanced tree.
|
||||||
return rightMostLeafNode == this.rightMostLeafNode
|
return rightMostLeafNode == this.rightMostLeafNode
|
||||||
? (long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodePointCount
|
? (long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodePointCount
|
||||||
: (long) numLeaves * config.maxPointsInLeafNode;
|
: (long) numLeaves * config.maxPointsInLeafNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
|
||||||
|
// number of points that need to be distributed between leaves, one per leaf
|
||||||
|
final int extraPoints =
|
||||||
|
Math.toIntExact(((long) config.maxPointsInLeafNode * this.leafNodeOffset) - pointCount);
|
||||||
|
assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
|
||||||
|
// offset where we stop adding one point to the leaves
|
||||||
|
final int nodeOffset = leafNodeOffset - extraPoints;
|
||||||
|
long count = 0;
|
||||||
|
for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
|
||||||
|
// offsetPosition provides which extra point will be added to this node
|
||||||
|
if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
|
||||||
|
count += config.maxPointsInLeafNode;
|
||||||
|
} else {
|
||||||
|
count += config.maxPointsInLeafNode - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int balanceTreeNodePosition(
|
||||||
|
int minNode, int maxNode, int node, int position, int level) {
|
||||||
|
if (maxNode - minNode == 1) {
|
||||||
|
return position;
|
||||||
|
}
|
||||||
|
final int mid = (minNode + maxNode + 1) >>> 1;
|
||||||
|
if (mid > node) {
|
||||||
|
return balanceTreeNodePosition(minNode, mid, node, position, level + 1);
|
||||||
|
} else {
|
||||||
|
return balanceTreeNodePosition(mid, maxNode, node, position + (1 << level), level + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void visitDocIDs(PointValues.IntersectVisitor visitor) throws IOException {
|
public void visitDocIDs(PointValues.IntersectVisitor visitor) throws IOException {
|
||||||
addAll(visitor, false);
|
addAll(visitor, false);
|
||||||
|
|
|
@ -822,7 +822,7 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
|
||||||
if (dimValues == null) {
|
if (dimValues == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
assertSize(dimValues.getPointTree());
|
||||||
byte[] leafMinValues = dimValues.getMinPackedValue();
|
byte[] leafMinValues = dimValues.getMinPackedValue();
|
||||||
byte[] leafMaxValues = dimValues.getMaxPackedValue();
|
byte[] leafMaxValues = dimValues.getMaxPackedValue();
|
||||||
for (int dim = 0; dim < numIndexDims; dim++) {
|
for (int dim = 0; dim < numIndexDims; dim++) {
|
||||||
|
@ -1063,6 +1063,36 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void assertSize(PointValues.PointTree tree) throws IOException {
|
||||||
|
final PointValues.PointTree clone = tree.clone();
|
||||||
|
assertEquals(clone.size(), tree.size());
|
||||||
|
final long[] size = new long[] {0};
|
||||||
|
clone.visitDocIDs(
|
||||||
|
new IntersectVisitor() {
|
||||||
|
@Override
|
||||||
|
public void visit(int docID) {
|
||||||
|
size[0]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assertEquals(size[0], tree.size());
|
||||||
|
if (tree.moveToChild()) {
|
||||||
|
do {
|
||||||
|
assertSize(tree);
|
||||||
|
} while (tree.moveToSibling());
|
||||||
|
tree.moveToParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testAddIndexes() throws IOException {
|
public void testAddIndexes() throws IOException {
|
||||||
Directory dir1 = newDirectory();
|
Directory dir1 = newDirectory();
|
||||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir1);
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir1);
|
||||||
|
|
Loading…
Reference in New Issue