LUCENE-9820: PointTree#size() should handle the case of balanced tree in pre-8.6 indexes (#462)

Properly handle the case where trees are fully balanced for a number of dimensions > 1
This commit is contained in:
Ignacio Vera 2021-11-25 11:03:16 +01:00 committed by GitHub
parent 8710252116
commit 800f002e44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 2546 additions and 70 deletions

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MutablePointTree;
@ -36,8 +37,6 @@ import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.bkd.BKDConfig;
import org.apache.lucene.util.bkd.BKDWriter;
/** Writes dimensional values */
public class Lucene60PointsWriter extends PointsWriter {
@ -91,8 +90,8 @@ public class Lucene60PointsWriter extends PointsWriter {
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
this(
writeState,
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter60.DEFAULT_MAX_MB_SORT_IN_HEAP);
}
@Override
@ -100,28 +99,22 @@ public class Lucene60PointsWriter extends PointsWriter {
PointValues.PointTree values = reader.getValues(fieldInfo.name).getPointTree();
BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);
try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
values.size())) {
if (values instanceof MutablePointTree) {
Runnable finalizer =
writer.writeField(dataOut, dataOut, dataOut, fieldInfo.name, (MutablePointTree) values);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointTree) values);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
return;
}
@ -145,10 +138,8 @@ public class Lucene60PointsWriter extends PointsWriter {
});
// We could have 0 points on merge since all docs with dimensional fields may be deleted:
Runnable finalizer = writer.finish(dataOut, dataOut, dataOut);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
if (writer.getPointCount() > 0) {
indexFPs.put(fieldInfo.name, writer.finish(dataOut));
}
}
}
@ -193,26 +184,22 @@ public class Lucene60PointsWriter extends PointsWriter {
}
}
BKDConfig config =
new BKDConfig(
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode);
// System.out.println("MERGE: field=" + fieldInfo.name);
// Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
// already sorted incoming segments, instead of trying to sort all points again as if
// we were simply reindexing them:
try (BKDWriter writer =
new BKDWriter(
try (BKDWriter60 writer =
new BKDWriter60(
writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
config,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
totMaxSize)) {
List<PointValues> pointValues = new ArrayList<>();
List<PointValues> bkdReaders = new ArrayList<>();
List<MergeState.DocMap> docMaps = new ArrayList<>();
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
PointsReader reader = mergeState.pointsReaders[i];
@ -231,19 +218,18 @@ public class Lucene60PointsWriter extends PointsWriter {
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
PointValues aPointValues = reader60.readers.get(readerFieldInfo.number);
if (aPointValues != null) {
pointValues.add(aPointValues);
PointValues bkdReader = reader60.readers.get(readerFieldInfo.number);
if (bkdReader != null) {
bkdReaders.add(bkdReader);
docMaps.add(mergeState.docMaps[i]);
}
}
}
}
Runnable finalizer = writer.merge(dataOut, dataOut, dataOut, docMaps, pointValues);
if (finalizer != null) {
indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
finalizer.run();
long fp = writer.merge(dataOut, docMaps, bkdReaders);
if (fp != -1) {
indexFPs.put(fieldInfo.name, fp);
}
}
} else {

View File

@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene60;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene60.bkd.BKDWriter60;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryPoint;
@ -35,7 +36,6 @@ import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase.Nightly;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.bkd.BKDConfig;
/** Tests Lucene60PointsFormat */
@Nightly // N-2 formats are only tested on nightly runs
@ -45,7 +45,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
public TestLucene60PointsFormat() {
codec = new Lucene84RWCodec();
maxPointsInLeafNode = BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
maxPointsInLeafNode = BKDWriter60.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}
@Override
@ -280,16 +280,23 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
};
final long pointCount = points.estimatePointCount(onePointMatchVisitor);
final long lastNodePointCount = totalValues % maxPointsInLeafNode;
// With >1 dims, the tree is balanced
long actualMaxPointsInLeafNode = points.size();
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
}
final long countPerFullLeaf = (actualMaxPointsInLeafNode + 1) / 2;
final long countPerNotFullLeaf = (actualMaxPointsInLeafNode) / 2;
assertTrue(
"" + pointCount,
pointCount == (maxPointsInLeafNode + 1) / 2 // common case
|| pointCount == (lastNodePointCount + 1) / 2 // not fully populated leaf
|| pointCount == 2 * ((maxPointsInLeafNode + 1) / 2) // if the point is a split value
|| pointCount == ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2)
// in extreme cases, a point can be shared by 4 leaves
|| pointCount == 4 * ((maxPointsInLeafNode + 1) / 2)
|| pointCount == 3 * ((maxPointsInLeafNode + 1) / 2) + ((lastNodePointCount + 1) / 2));
pointCount + " vs " + actualMaxPointsInLeafNode,
// common case, point in one leaf.
pointCount >= countPerNotFullLeaf && pointCount <= countPerFullLeaf
||
// one dimension is a split value
pointCount >= 2 * countPerNotFullLeaf && pointCount <= 2 * countPerFullLeaf
||
// both dimensions are split values
pointCount >= 4 * countPerNotFullLeaf && pointCount <= 4 * countPerFullLeaf);
final long docCount = points.estimateDocCount(onePointMatchVisitor);
if (multiValues) {

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene60.bkd;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
/**
 * Writes blocks of doc ids for the legacy (pre-8.6) BKD format. A block is encoded in one of
 * three layouts, selected per block and recorded in a single header byte so the reader can pick
 * the matching decoder: {@code 0} = delta-encoded vints (ids sorted ascending), {@code 24} =
 * eight ids packed into three longs at 24 bits each (all ids fit in 24 bits), {@code 32} = plain
 * 32-bit ints.
 */
class DocIdsWriter {

  // Static utility only; never instantiated.
  private DocIdsWriter() {}

  /**
   * Writes {@code count} doc ids from {@code docIds}, starting at offset {@code start}, to
   * {@code out} using the most compact of the three supported encodings.
   *
   * @param docIds source array of doc ids
   * @param start offset of the first id to write
   * @param count number of ids to write
   * @param out destination output
   * @throws IOException if writing to {@code out} fails
   */
  static void writeDocIds(int[] docIds, int start, int count, DataOutput out) throws IOException {
    // docs can be sorted either when all docs in a block have the same value
    // or when a segment is sorted
    boolean sorted = true;
    for (int i = 1; i < count; ++i) {
      if (docIds[start + i - 1] > docIds[start + i]) {
        sorted = false;
        break;
      }
    }
    if (sorted) {
      // Header 0: ascending ids, store deltas as vints.
      out.writeByte((byte) 0);
      int previous = 0;
      for (int i = 0; i < count; ++i) {
        int doc = docIds[start + i];
        out.writeVInt(doc - previous);
        previous = doc;
      }
    } else {
      // OR all ids together: if the union of set bits fits in 24 bits, every id does.
      long max = 0;
      for (int i = 0; i < count; ++i) {
        max |= Integer.toUnsignedLong(docIds[start + i]);
      }
      if (max <= 0xffffff) {
        // Header 24: pack 8 ids (24 bits each) into 3 longs.
        out.writeByte((byte) 24);
        // write them the same way we are reading them.
        int i;
        for (i = 0; i < count - 7; i += 8) {
          int doc1 = docIds[start + i];
          int doc2 = docIds[start + i + 1];
          int doc3 = docIds[start + i + 2];
          int doc4 = docIds[start + i + 3];
          int doc5 = docIds[start + i + 4];
          int doc6 = docIds[start + i + 5];
          int doc7 = docIds[start + i + 6];
          int doc8 = docIds[start + i + 7];
          long l1 = (doc1 & 0xffffffL) << 40 | (doc2 & 0xffffffL) << 16 | ((doc3 >>> 8) & 0xffffL);
          long l2 =
              (doc3 & 0xffL) << 56
                  | (doc4 & 0xffffffL) << 32
                  | (doc5 & 0xffffffL) << 8
                  // Use the unsigned shift, consistent with the other shifts in this packing;
                  // equivalent for the non-negative 24-bit ids that reach this branch.
                  | ((doc6 >>> 16) & 0xffL);
          long l3 = (doc6 & 0xffffL) << 48 | (doc7 & 0xffffffL) << 24 | (doc8 & 0xffffffL);
          out.writeLong(l1);
          out.writeLong(l2);
          out.writeLong(l3);
        }
        // Tail of fewer than 8 ids: write each as a 24-bit big-endian value (short + byte).
        for (; i < count; ++i) {
          out.writeShort((short) (docIds[start + i] >>> 8));
          out.writeByte((byte) docIds[start + i]);
        }
      } else {
        // Header 32: at least one id needs more than 24 bits, fall back to plain ints.
        out.writeByte((byte) 32);
        for (int i = 0; i < count; ++i) {
          out.writeInt(docIds[start + i]);
        }
      }
    }
  }
}

View File

@ -87,7 +87,6 @@ final class SimpleTextBKDReader extends PointValues {
int nodeID;
int level;
final int rootNode;
final int lastLeafNodeCount;
// holds the min / max value of the current node.
private final byte[] minPackedValue, maxPackedValue;
// holds the previous value of the split dimension
@ -107,9 +106,6 @@ final class SimpleTextBKDReader extends PointValues {
int treeDepth = getTreeDepth(leafNodeOffset);
splitDimValueStack = new byte[treeDepth + 1][];
splitDims = new int[treeDepth + 1];
int lastLeafNodeCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
this.lastLeafNodeCount =
lastLeafNodeCount == 0 ? config.maxPointsInLeafNode : lastLeafNodeCount;
}
private int getTreeDepth(int numLeaves) {
@ -285,9 +281,39 @@ final class SimpleTextBKDReader extends PointValues {
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
}
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
return rightMostLeafNode == (1 << getTreeDepth(leafNodeOffset) - 1) - 1
? (long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodeCount
: (long) numLeaves * config.maxPointsInLeafNode;
return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
}
/**
 * Computes the number of points stored under the given contiguous range of leaf nodes for a
 * fully balanced tree (the layout used for multi-dimensional fields before Lucene 8.6). In such
 * a tree every leaf holds either {@code maxPointsInLeafNode} or {@code maxPointsInLeafNode - 1}
 * points.
 */
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
  // How many leaves are one point short of full (one missing point per short leaf).
  final int missingPoints =
      Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
  assert missingPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
  // Leaves whose in-order position is below this threshold are completely full.
  final int fullLeafLimit = leafNodeOffset - missingPoints;
  // Start by assuming every leaf in the range is full, then subtract one point for each
  // leaf whose position falls at or beyond the full-leaf threshold.
  long total = (long) (rightMostLeafNode - leftMostLeafNode + 1) * config.maxPointsInLeafNode;
  for (int leaf = leftMostLeafNode; leaf <= rightMostLeafNode; leaf++) {
    if (balanceTreeNodePosition(0, leafNodeOffset, leaf - leafNodeOffset, 0, 0) >= fullLeafLimit) {
      total--;
    }
  }
  return total;
}
/**
 * Returns the in-order position of {@code node} among the leaves of a balanced tree, by
 * descending the implicit binary tree over the leaf range [{@code minNode}, {@code maxNode}).
 * Each step to the right half contributes {@code 1 << level} to the position.
 */
private int balanceTreeNodePosition(int minNode, int maxNode, int node, int position, int level) {
  while (maxNode - minNode > 1) {
    final int mid = (minNode + maxNode + 1) >>> 1;
    if (node < mid) {
      // descend into the left half; position unchanged
      maxNode = mid;
    } else {
      // descend into the right half; account for all leaves skipped on the left
      position += 1 << level;
      minNode = mid;
    }
    level++;
  }
  return position;
}
private int getNumLeavesSlow(int node) {

View File

@ -349,7 +349,7 @@ final class SimpleTextBKDWriter implements Closeable {
new int[config.maxPointsInLeafNode]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
return indexFP;
}
@ -478,7 +478,7 @@ final class SimpleTextBKDWriter implements Closeable {
for (int i = 0; i < leafBlockFPs.size(); i++) {
arr[i] = leafBlockFPs.get(i);
}
writeIndex(out, arr, index);
writeIndex(out, arr, index, config.maxPointsInLeafNode);
return indexFP;
}
@ -714,16 +714,15 @@ final class SimpleTextBKDWriter implements Closeable {
}
}
// System.out.println("Total nodes: " + innerNodeCount);
// Write index:
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
return indexFP;
}
/** Subclass can change how it writes the index. */
private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues)
private void writeIndex(
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
throws IOException {
write(out, NUM_DATA_DIMS);
writeInt(out, config.numDims);
@ -738,7 +737,7 @@ final class SimpleTextBKDWriter implements Closeable {
newline(out);
write(out, MAX_LEAF_POINTS);
writeInt(out, config.maxPointsInLeafNode);
writeInt(out, maxPointsInLeafNode);
newline(out);
write(out, INDEX_COUNT);

View File

@ -154,6 +154,8 @@ public class BKDReader extends PointValues {
private final int leafNodeOffset;
// version of the index
private final int version;
// total number of points
final long pointCount;
// last node might not be fully populated
private final int lastLeafNodePointCount;
// right most leaf node ID
@ -181,7 +183,7 @@ public class BKDReader extends PointValues {
config,
numLeaves,
version,
Math.toIntExact(pointCount % config.maxPointsInLeafNode),
pointCount,
1,
1,
minPackedValue,
@ -201,7 +203,7 @@ public class BKDReader extends PointValues {
BKDConfig config,
int numLeaves,
int version,
int lastLeafNodePointCount,
long pointCount,
int nodeID,
int level,
byte[] minPackedValue,
@ -231,7 +233,9 @@ public class BKDReader extends PointValues {
splitDimsPos = new int[treeDepth];
negativeDeltas = new boolean[config.numIndexDims * treeDepth];
// information about the unbalance of the tree so we can report the exact size below a node
this.pointCount = pointCount;
rightMostLeafNode = (1 << treeDepth - 1) - 1;
int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
this.lastLeafNodePointCount =
lastLeafNodePointCount == 0 ? config.maxPointsInLeafNode : lastLeafNodePointCount;
// scratch objects, reused between clones so NN search are not creating those objects
@ -252,7 +256,7 @@ public class BKDReader extends PointValues {
config,
leafNodeOffset,
version,
lastLeafNodePointCount,
pointCount,
nodeID,
level,
minPackedValue,
@ -437,11 +441,48 @@ public class BKDReader extends PointValues {
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
}
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
if (version < BKDWriter.VERSION_META_FILE && config.numDims > 1) {
// before lucene 8.6, high dimensional trees were constructed as fully balanced trees.
return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
}
// size for an unbalanced tree.
return rightMostLeafNode == this.rightMostLeafNode
? (long) (numLeaves - 1) * config.maxPointsInLeafNode + lastLeafNodePointCount
: (long) numLeaves * config.maxPointsInLeafNode;
}
/**
 * Computes the number of points stored under the given contiguous range of leaf nodes for a
 * fully balanced tree (the layout used for multi-dimensional fields before Lucene 8.6). In such
 * a tree every leaf holds either {@code maxPointsInLeafNode} or {@code maxPointsInLeafNode - 1}
 * points.
 */
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
  // How many leaves are one point short of full (one missing point per short leaf).
  final int missingPoints =
      Math.toIntExact(((long) config.maxPointsInLeafNode * this.leafNodeOffset) - pointCount);
  assert missingPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
  // Leaves whose in-order position is below this threshold are completely full.
  final int fullLeafLimit = leafNodeOffset - missingPoints;
  // Start by assuming every leaf in the range is full, then subtract one point for each
  // leaf whose position falls at or beyond the full-leaf threshold.
  long total = (long) (rightMostLeafNode - leftMostLeafNode + 1) * config.maxPointsInLeafNode;
  for (int leaf = leftMostLeafNode; leaf <= rightMostLeafNode; leaf++) {
    if (balanceTreeNodePosition(0, leafNodeOffset, leaf - leafNodeOffset, 0, 0) >= fullLeafLimit) {
      total--;
    }
  }
  return total;
}
/**
 * Returns the in-order position of {@code node} among the leaves of a balanced tree, by
 * descending the implicit binary tree over the leaf range [{@code minNode}, {@code maxNode}).
 * Each step to the right half contributes {@code 1 << level} to the position.
 */
private int balanceTreeNodePosition(int minNode, int maxNode, int node, int position, int level) {
  while (maxNode - minNode > 1) {
    final int mid = (minNode + maxNode + 1) >>> 1;
    if (node < mid) {
      // descend into the left half; position unchanged
      maxNode = mid;
    } else {
      // descend into the right half; account for all leaves skipped on the left
      position += 1 << level;
      minNode = mid;
    }
    level++;
  }
  return position;
}
@Override
public void visitDocIDs(PointValues.IntersectVisitor visitor) throws IOException {
addAll(visitor, false);

View File

@ -822,7 +822,7 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
if (dimValues == null) {
continue;
}
assertSize(dimValues.getPointTree());
byte[] leafMinValues = dimValues.getMinPackedValue();
byte[] leafMaxValues = dimValues.getMaxPackedValue();
for (int dim = 0; dim < numIndexDims; dim++) {
@ -1063,6 +1063,36 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
}
}
/**
 * Recursively checks that {@link PointValues.PointTree#size()} reports the exact number of
 * points below every node of the tree: a clone must report the same size, the size must match
 * the number of doc ids actually visited, and the same invariants must hold for every child.
 */
private void assertSize(PointValues.PointTree tree) throws IOException {
  final PointValues.PointTree copy = tree.clone();
  // A clone positioned at the same node must report the same size.
  assertEquals(copy.size(), tree.size());
  // Count every doc id below this node and compare against the reported size.
  final long[] visited = {0};
  copy.visitDocIDs(
      new IntersectVisitor() {
        @Override
        public void visit(int docID) {
          visited[0]++;
        }

        @Override
        public void visit(int docID, byte[] packedValue) {
          throw new UnsupportedOperationException();
        }

        @Override
        public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
          throw new UnsupportedOperationException();
        }
      });
  assertEquals(visited[0], tree.size());
  // Recurse into every child so the invariant is verified at all levels.
  if (tree.moveToChild() == false) {
    return;
  }
  do {
    assertSize(tree);
  } while (tree.moveToSibling());
  tree.moveToParent();
}
public void testAddIndexes() throws IOException {
Directory dir1 = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir1);