mirror of https://github.com/apache/lucene.git
LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes long ords even for small number of points
This commit is contained in:
parent
9f9c2f74b3
commit
28a4335813
|
@ -128,21 +128,31 @@ public class BKDWriter implements Closeable {
|
||||||
protected long pointCount;
|
protected long pointCount;
|
||||||
|
|
||||||
/** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
|
/** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
|
||||||
private final boolean longOrds;
|
protected final boolean longOrds;
|
||||||
|
|
||||||
/** An upper bound on how many points the caller will add (includes deletions) */
|
/** An upper bound on how many points the caller will add (includes deletions) */
|
||||||
private final long totalPointCount;
|
private final long totalPointCount;
|
||||||
|
|
||||||
/** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
|
/** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
|
||||||
private final boolean singleValuePerDoc;
|
protected final boolean singleValuePerDoc;
|
||||||
|
|
||||||
|
/** How much heap OfflineSorter is allowed to use */
|
||||||
|
protected final OfflineSorter.BufferSize offlineSorterBufferMB;
|
||||||
|
|
||||||
|
/** How much heap OfflineSorter is allowed to use */
|
||||||
|
protected final int offlineSorterMaxTempFiles;
|
||||||
|
|
||||||
private final int maxDoc;
|
private final int maxDoc;
|
||||||
|
|
||||||
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
|
||||||
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc);
|
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
||||||
|
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
|
||||||
|
totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
|
||||||
}
|
}
|
||||||
|
|
||||||
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
|
||||||
|
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
|
||||||
|
boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException {
|
||||||
verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
|
verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
|
||||||
// We use tracking dir to deal with removing files on exception, so each place that
|
// We use tracking dir to deal with removing files on exception, so each place that
|
||||||
// creates temp files doesn't need crazy try/finally/sucess logic:
|
// creates temp files doesn't need crazy try/finally/sucess logic:
|
||||||
|
@ -153,6 +163,8 @@ public class BKDWriter implements Closeable {
|
||||||
this.bytesPerDim = bytesPerDim;
|
this.bytesPerDim = bytesPerDim;
|
||||||
this.totalPointCount = totalPointCount;
|
this.totalPointCount = totalPointCount;
|
||||||
this.maxDoc = maxDoc;
|
this.maxDoc = maxDoc;
|
||||||
|
this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
|
||||||
|
this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
|
||||||
docsSeen = new FixedBitSet(maxDoc);
|
docsSeen = new FixedBitSet(maxDoc);
|
||||||
packedBytesLength = numDims * bytesPerDim;
|
packedBytesLength = numDims * bytesPerDim;
|
||||||
|
|
||||||
|
@ -166,7 +178,8 @@ public class BKDWriter implements Closeable {
|
||||||
maxPackedValue = new byte[packedBytesLength];
|
maxPackedValue = new byte[packedBytesLength];
|
||||||
|
|
||||||
// If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
|
// If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
|
||||||
longOrds = totalPointCount > Integer.MAX_VALUE;
|
this.longOrds = longOrds;
|
||||||
|
|
||||||
this.singleValuePerDoc = singleValuePerDoc;
|
this.singleValuePerDoc = singleValuePerDoc;
|
||||||
|
|
||||||
// dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
|
// dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
|
||||||
|
@ -735,7 +748,7 @@ public class BKDWriter implements Closeable {
|
||||||
// TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
|
// TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
|
||||||
IndexOutput[] lastWriter = new IndexOutput[1];
|
IndexOutput[] lastWriter = new IndexOutput[1];
|
||||||
|
|
||||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
|
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) {
|
||||||
|
|
||||||
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
|
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader {
|
||||||
in.readBytes(buffer, 0, buffer.length);
|
in.readBytes(buffer, 0, buffer.length);
|
||||||
|
|
||||||
long ord;
|
long ord;
|
||||||
if (singleValuePerDoc == false) {
|
if (longOrds) {
|
||||||
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
|
// A long ord, after the docID:
|
||||||
} else if (longOrds) {
|
|
||||||
ord = readLong(buffer, packedBytesLength+Integer.BYTES);
|
ord = readLong(buffer, packedBytesLength+Integer.BYTES);
|
||||||
} else {
|
} else if (singleValuePerDoc) {
|
||||||
|
// docID is the ord:
|
||||||
ord = readInt(buffer, packedBytesLength);
|
ord = readInt(buffer, packedBytesLength);
|
||||||
|
} else {
|
||||||
|
// An int ord, after the docID:
|
||||||
|
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rightTree.get(ord)) {
|
if (rightTree.get(ord)) {
|
||||||
|
|
|
@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
||||||
|
|
||||||
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
||||||
|
|
||||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false);
|
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES,
|
||||||
|
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
byte[] packedBytes = new byte[Long.BYTES];
|
byte[] packedBytes = new byte[Long.BYTES];
|
||||||
for (int docID = 0; docID < numDocs; docID++) {
|
for (int docID = 0; docID < numDocs; docID++) {
|
||||||
|
@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
||||||
|
|
||||||
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
||||||
|
|
||||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false);
|
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES,
|
||||||
|
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
byte[] packedBytes = new byte[2*Long.BYTES];
|
byte[] packedBytes = new byte[2*Long.BYTES];
|
||||||
for (int docID = 0; docID < numDocs; docID++) {
|
for (int docID = 0; docID < numDocs; docID++) {
|
||||||
|
|
|
@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase {
|
||||||
public void testTieBreakOrder() throws Exception {
|
public void testTieBreakOrder() throws Exception {
|
||||||
try (Directory dir = newDirectory()) {
|
try (Directory dir = newDirectory()) {
|
||||||
int numDocs = 10000;
|
int numDocs = 10000;
|
||||||
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true);
|
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true);
|
||||||
for(int i=0;i<numDocs;i++) {
|
for(int i=0;i<numDocs;i++) {
|
||||||
w.add(new byte[Integer.BYTES], i);
|
w.add(new byte[Integer.BYTES], i);
|
||||||
}
|
}
|
||||||
|
@ -925,4 +925,53 @@ public class TestBKD extends LuceneTestCase {
|
||||||
in.close();
|
in.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void test2DLongOrdsOffline() throws Exception {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
int numDocs = 100000;
|
||||||
|
boolean singleValuePerDoc = false;
|
||||||
|
boolean longOrds = true;
|
||||||
|
int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20);
|
||||||
|
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs,
|
||||||
|
singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles);
|
||||||
|
byte[] buffer = new byte[2*Integer.BYTES];
|
||||||
|
for(int i=0;i<numDocs;i++) {
|
||||||
|
random().nextBytes(buffer);
|
||||||
|
w.add(buffer, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
|
||||||
|
long fp = w.finish(out);
|
||||||
|
out.close();
|
||||||
|
|
||||||
|
IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
|
||||||
|
in.seek(fp);
|
||||||
|
BKDReader r = new BKDReader(in);
|
||||||
|
int[] count = new int[1];
|
||||||
|
r.intersect(new IntersectVisitor() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID) {
|
||||||
|
count[0]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) {
|
||||||
|
visit(docID);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation compare(byte[] minPacked, byte[] maxPacked) {
|
||||||
|
if (random().nextInt(7) == 1) {
|
||||||
|
return Relation.CELL_CROSSES_QUERY;
|
||||||
|
} else {
|
||||||
|
return Relation.CELL_INSIDE_QUERY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assertEquals(numDocs, count[0]);
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -277,10 +277,32 @@ public class RandomCodec extends AssertingCodec {
|
||||||
public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
|
public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
|
||||||
int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
|
int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
|
||||||
long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
|
long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
|
||||||
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc);
|
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount,
|
||||||
|
getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed),
|
||||||
|
getRandomLongOrds(totalPointCount, singleValuePerDoc, randomSeed),
|
||||||
|
getRandomOfflineSorterBufferMB(randomSeed),
|
||||||
|
getRandomOfflineSorterMaxTempFiles(randomSeed));
|
||||||
this.random = new Random(randomSeed);
|
this.random = new Random(randomSeed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean getRandomSingleValuePerDoc(boolean singleValuePerDoc, int randomSeed) {
|
||||||
|
// If we are single valued, sometimes pretend we aren't:
|
||||||
|
return singleValuePerDoc && (new Random(randomSeed).nextBoolean());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean getRandomLongOrds(long totalPointCount, boolean singleValuePerDoc, int randomSeed) {
|
||||||
|
// Always use long ords if we have too many points, but sometimes randomly use it anyway when singleValuePerDoc is false:
|
||||||
|
return totalPointCount > Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed) == false && new Random(randomSeed).nextBoolean());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long getRandomOfflineSorterBufferMB(int randomSeed) {
|
||||||
|
return TestUtil.nextInt(new Random(randomSeed), 1, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) {
|
||||||
|
return TestUtil.nextInt(new Random(randomSeed), 2, 20);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
|
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
// BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
|
// BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
|
||||||
|
|
Loading…
Reference in New Issue