mirror of https://github.com/apache/lucene.git
LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes long ords even for small number of points
This commit is contained in:
parent
6d288704c4
commit
7b93fc46b1
|
@ -128,21 +128,31 @@ public class BKDWriter implements Closeable {
|
|||
protected long pointCount;
|
||||
|
||||
/** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
|
||||
private final boolean longOrds;
|
||||
protected final boolean longOrds;
|
||||
|
||||
/** An upper bound on how many points the caller will add (includes deletions) */
|
||||
private final long totalPointCount;
|
||||
|
||||
/** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
|
||||
private final boolean singleValuePerDoc;
|
||||
protected final boolean singleValuePerDoc;
|
||||
|
||||
/** How much heap OfflineSorter is allowed to use */
|
||||
protected final OfflineSorter.BufferSize offlineSorterBufferMB;
|
||||
|
||||
/** How much heap OfflineSorter is allowed to use */
|
||||
protected final int offlineSorterMaxTempFiles;
|
||||
|
||||
private final int maxDoc;
|
||||
|
||||
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
||||
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc);
|
||||
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
|
||||
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
||||
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
|
||||
totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
|
||||
}
|
||||
|
||||
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
|
||||
protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
|
||||
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
|
||||
boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException {
|
||||
verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
|
||||
// We use tracking dir to deal with removing files on exception, so each place that
|
||||
// creates temp files doesn't need crazy try/finally/sucess logic:
|
||||
|
@ -153,6 +163,8 @@ public class BKDWriter implements Closeable {
|
|||
this.bytesPerDim = bytesPerDim;
|
||||
this.totalPointCount = totalPointCount;
|
||||
this.maxDoc = maxDoc;
|
||||
this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
|
||||
this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
|
||||
docsSeen = new FixedBitSet(maxDoc);
|
||||
packedBytesLength = numDims * bytesPerDim;
|
||||
|
||||
|
@ -166,7 +178,8 @@ public class BKDWriter implements Closeable {
|
|||
maxPackedValue = new byte[packedBytesLength];
|
||||
|
||||
// If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
|
||||
longOrds = totalPointCount > Integer.MAX_VALUE;
|
||||
this.longOrds = longOrds;
|
||||
|
||||
this.singleValuePerDoc = singleValuePerDoc;
|
||||
|
||||
// dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
|
||||
|
@ -735,7 +748,7 @@ public class BKDWriter implements Closeable {
|
|||
// TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
|
||||
IndexOutput[] lastWriter = new IndexOutput[1];
|
||||
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) {
|
||||
|
||||
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
|
||||
@Override
|
||||
|
|
|
@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader {
|
|||
in.readBytes(buffer, 0, buffer.length);
|
||||
|
||||
long ord;
|
||||
if (singleValuePerDoc == false) {
|
||||
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
|
||||
} else if (longOrds) {
|
||||
if (longOrds) {
|
||||
// A long ord, after the docID:
|
||||
ord = readLong(buffer, packedBytesLength+Integer.BYTES);
|
||||
} else {
|
||||
} else if (singleValuePerDoc) {
|
||||
// docID is the ord:
|
||||
ord = readInt(buffer, packedBytesLength);
|
||||
} else {
|
||||
// An int ord, after the docID:
|
||||
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
|
||||
}
|
||||
|
||||
if (rightTree.get(ord)) {
|
||||
|
|
|
@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
|||
|
||||
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
||||
|
||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false);
|
||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES,
|
||||
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
|
||||
int counter = 0;
|
||||
byte[] packedBytes = new byte[Long.BYTES];
|
||||
for (int docID = 0; docID < numDocs; docID++) {
|
||||
|
@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
|
|||
|
||||
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
|
||||
|
||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false);
|
||||
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES,
|
||||
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
|
||||
int counter = 0;
|
||||
byte[] packedBytes = new byte[2*Long.BYTES];
|
||||
for (int docID = 0; docID < numDocs; docID++) {
|
||||
|
|
|
@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase {
|
|||
public void testTieBreakOrder() throws Exception {
|
||||
try (Directory dir = newDirectory()) {
|
||||
int numDocs = 10000;
|
||||
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true);
|
||||
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true);
|
||||
for(int i=0;i<numDocs;i++) {
|
||||
w.add(new byte[Integer.BYTES], i);
|
||||
}
|
||||
|
@ -925,4 +925,53 @@ public class TestBKD extends LuceneTestCase {
|
|||
in.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void test2DLongOrdsOffline() throws Exception {
|
||||
try (Directory dir = newDirectory()) {
|
||||
int numDocs = 100000;
|
||||
boolean singleValuePerDoc = false;
|
||||
boolean longOrds = true;
|
||||
int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20);
|
||||
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs,
|
||||
singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles);
|
||||
byte[] buffer = new byte[2*Integer.BYTES];
|
||||
for(int i=0;i<numDocs;i++) {
|
||||
random().nextBytes(buffer);
|
||||
w.add(buffer, i);
|
||||
}
|
||||
|
||||
IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
|
||||
long fp = w.finish(out);
|
||||
out.close();
|
||||
|
||||
IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
|
||||
in.seek(fp);
|
||||
BKDReader r = new BKDReader(in);
|
||||
int[] count = new int[1];
|
||||
r.intersect(new IntersectVisitor() {
|
||||
|
||||
@Override
|
||||
public void visit(int docID) {
|
||||
count[0]++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) {
|
||||
visit(docID);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPacked, byte[] maxPacked) {
|
||||
if (random().nextInt(7) == 1) {
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
} else {
|
||||
return Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
}
|
||||
});
|
||||
assertEquals(numDocs, count[0]);
|
||||
in.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -277,10 +277,32 @@ public class RandomCodec extends AssertingCodec {
|
|||
public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
|
||||
int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
|
||||
long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
|
||||
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc);
|
||||
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount,
|
||||
getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed),
|
||||
getRandomLongOrds(totalPointCount, singleValuePerDoc, randomSeed),
|
||||
getRandomOfflineSorterBufferMB(randomSeed),
|
||||
getRandomOfflineSorterMaxTempFiles(randomSeed));
|
||||
this.random = new Random(randomSeed);
|
||||
}
|
||||
|
||||
private static boolean getRandomSingleValuePerDoc(boolean singleValuePerDoc, int randomSeed) {
|
||||
// If we are single valued, sometimes pretend we aren't:
|
||||
return singleValuePerDoc && (new Random(randomSeed).nextBoolean());
|
||||
}
|
||||
|
||||
private static boolean getRandomLongOrds(long totalPointCount, boolean singleValuePerDoc, int randomSeed) {
|
||||
// Always use long ords if we have too many points, but sometimes randomly use it anyway when singleValuePerDoc is false:
|
||||
return totalPointCount > Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed) == false && new Random(randomSeed).nextBoolean());
|
||||
}
|
||||
|
||||
private static long getRandomOfflineSorterBufferMB(int randomSeed) {
|
||||
return TestUtil.nextInt(new Random(randomSeed), 1, 8);
|
||||
}
|
||||
|
||||
private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) {
|
||||
return TestUtil.nextInt(new Random(randomSeed), 2, 20);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
// BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
|
||||
|
|
Loading…
Reference in New Issue