LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes use long ords even for a small number of points

Author: Mike McCandless
Date:   2016-03-27 05:52:00 -04:00
Parent: 9f9c2f74b3
Commit: 28a4335813

5 changed files with 104 additions and 15 deletions
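Summary of the change: the public BKDWriter constructor keeps its previous defaults, but now delegates to a new protected constructor that exposes the ord width (longOrds) plus the OfflineSorter buffer size and temp-file limit, so subclasses and tests can force the long-ord and offline-sort code paths without indexing billions of points; the old 7-argument convenience constructor is dropped, which is why the tests below now pass the DEFAULT_* constants explicitly. As orientation, a minimal sketch of the defaults the public constructor forwards (hypothetical class, not Lucene source; names taken from the diff below):

// A minimal sketch (hypothetical class): the default wiring the public constructor
// now forwards to the new protected constructor, per the diff below.
final class BkdWriterDefaultsSketch {

  /** Ords are written as 8-byte longs only when a 4-byte int could overflow. */
  static boolean defaultLongOrds(long totalPointCount) {
    return totalPointCount > Integer.MAX_VALUE;
  }

  /** OfflineSorter gets the same heap budget as the in-heap sort, but at least 1 MB. */
  static long defaultOfflineSorterBufferMB(double maxMBSortInHeap) {
    return Math.max(1, (long) maxMBSortInHeap);
  }

  // The temp-file limit simply defaults to OfflineSorter.MAX_TEMPFILES.
}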


@@ -128,21 +128,31 @@ public class BKDWriter implements Closeable {
   protected long pointCount;

   /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
-  private final boolean longOrds;
+  protected final boolean longOrds;

   /** An upper bound on how many points the caller will add (includes deletions) */
   private final long totalPointCount;

   /** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
-  private final boolean singleValuePerDoc;
+  protected final boolean singleValuePerDoc;
+
+  /** How much heap OfflineSorter is allowed to use */
+  protected final OfflineSorter.BufferSize offlineSorterBufferMB;
+
+  /** How much heap OfflineSorter is allowed to use */
+  protected final int offlineSorterMaxTempFiles;

   private final int maxDoc;

-  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException {
-    this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc);
+  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
+                   int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
+    this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
+         totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
   }

-  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
+  protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
+                      int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
+                      boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException {
     verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
     // We use tracking dir to deal with removing files on exception, so each place that
     // creates temp files doesn't need crazy try/finally/sucess logic:
@@ -153,6 +163,8 @@ public class BKDWriter implements Closeable {
     this.bytesPerDim = bytesPerDim;
     this.totalPointCount = totalPointCount;
     this.maxDoc = maxDoc;
+    this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
+    this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
     docsSeen = new FixedBitSet(maxDoc);
     packedBytesLength = numDims * bytesPerDim;
@@ -166,7 +178,8 @@ public class BKDWriter implements Closeable {
     maxPackedValue = new byte[packedBytesLength];

     // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
-    longOrds = totalPointCount > Integer.MAX_VALUE;
+    this.longOrds = longOrds;
+
     this.singleValuePerDoc = singleValuePerDoc;

     // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
@@ -735,7 +748,7 @@ public class BKDWriter implements Closeable {
     // TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
     IndexOutput[] lastWriter = new IndexOutput[1];

-    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
+    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) {

       /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
       @Override


@@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader {
       in.readBytes(buffer, 0, buffer.length);

       long ord;
-      if (singleValuePerDoc == false) {
-        ord = readInt(buffer, packedBytesLength+Integer.BYTES);
-      } else if (longOrds) {
+      if (longOrds) {
+        // A long ord, after the docID:
         ord = readLong(buffer, packedBytesLength+Integer.BYTES);
-      } else {
+      } else if (singleValuePerDoc) {
+        // docID is the ord:
         ord = readInt(buffer, packedBytesLength);
+      } else {
+        // An int ord, after the docID:
+        ord = readInt(buffer, packedBytesLength+Integer.BYTES);
       }

       if (rightTree.get(ord)) {
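This reordering is the core of the splitting fix: the old code tested singleValuePerDoc == false first and always read a 4-byte int there, so when ords had been written as 8-byte longs the reader decoded a garbage ord and rightTree.get(ord) routed points to the wrong side of the split. The corrected branches match the record layout the offsets imply: packed value, then docID, then the optional ord. A minimal sketch of the corresponding write side under that assumed layout (the real OfflinePointWriter may differ in detail):

// Sketch only: one fixed-width record in the layout the offsets above imply
// (packed value, then docID, then the optional ord).
final class PointRecordWriteSketch {
  static void writeOneRecord(org.apache.lucene.store.DataOutput out,
                             byte[] packedValue, int docID, long ord,
                             boolean singleValuePerDoc, boolean longOrds) throws java.io.IOException {
    out.writeBytes(packedValue, 0, packedValue.length); // numDims * bytesPerDim value bytes
    out.writeInt(docID);                                // docID always follows the packed value
    if (singleValuePerDoc == false) {                   // otherwise the docID doubles as the ord
      if (longOrds) {
        out.writeLong(ord);                             // 8-byte ord
      } else {
        out.writeInt((int) ord);                        // 4-byte ord
      }
    }
  }
}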


@@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;

-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES,
+                                BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
     int counter = 0;
     byte[] packedBytes = new byte[Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {
@@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;

-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES,
+                                BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
     int counter = 0;
     byte[] packedBytes = new byte[2*Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {


@@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase {
   public void testTieBreakOrder() throws Exception {
     try (Directory dir = newDirectory()) {
       int numDocs = 10000;
-      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true);
+      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true);
       for(int i=0;i<numDocs;i++) {
         w.add(new byte[Integer.BYTES], i);
       }
@@ -925,4 +925,53 @@ public class TestBKD extends LuceneTestCase {
       in.close();
     }
   }
+
+  public void test2DLongOrdsOffline() throws Exception {
+    try (Directory dir = newDirectory()) {
+      int numDocs = 100000;
+      boolean singleValuePerDoc = false;
+      boolean longOrds = true;
+      int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20);
+      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs,
+                                  singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles);
+      byte[] buffer = new byte[2*Integer.BYTES];
+      for(int i=0;i<numDocs;i++) {
+        random().nextBytes(buffer);
+        w.add(buffer, i);
+      }
+
+      IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
+      long fp = w.finish(out);
+      out.close();
+
+      IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
+      in.seek(fp);
+      BKDReader r = new BKDReader(in);
+      int[] count = new int[1];
+      r.intersect(new IntersectVisitor() {
+
+          @Override
+          public void visit(int docID) {
+            count[0]++;
+          }
+
+          @Override
+          public void visit(int docID, byte[] packedValue) {
+            visit(docID);
+          }
+
+          @Override
+          public Relation compare(byte[] minPacked, byte[] maxPacked) {
+            if (random().nextInt(7) == 1) {
+              return Relation.CELL_CROSSES_QUERY;
+            } else {
+              return Relation.CELL_INSIDE_QUERY;
+            }
+          }
+        });
+
+      assertEquals(numDocs, count[0]);
+      in.close();
+    }
+  }
 }
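The new test2DLongOrdsOffline case is what gives the fix coverage with a small point count: it forces longOrds=true and singleValuePerDoc=false through the protected constructor, caps the OfflineSorter buffer at 1 MB and randomizes its temp-file limit, so 100,000 two-dimensional points are enough to push sorting through the offline, long-ord path that previously mis-read ords during splitting.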


@@ -277,10 +277,32 @@ public class RandomCodec extends AssertingCodec {
     public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
                                       int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
                                       long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
-      super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc);
+      super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount,
+            getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed),
+            getRandomLongOrds(totalPointCount, singleValuePerDoc, randomSeed),
+            getRandomOfflineSorterBufferMB(randomSeed),
+            getRandomOfflineSorterMaxTempFiles(randomSeed));
       this.random = new Random(randomSeed);
     }

+    private static boolean getRandomSingleValuePerDoc(boolean singleValuePerDoc, int randomSeed) {
+      // If we are single valued, sometimes pretend we aren't:
+      return singleValuePerDoc && (new Random(randomSeed).nextBoolean());
+    }
+
+    private static boolean getRandomLongOrds(long totalPointCount, boolean singleValuePerDoc, int randomSeed) {
+      // Always use long ords if we have too many points, but sometimes randomly use it anyway when singleValuePerDoc is false:
+      return totalPointCount > Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed) == false && new Random(randomSeed).nextBoolean());
+    }
+
+    private static long getRandomOfflineSorterBufferMB(int randomSeed) {
+      return TestUtil.nextInt(new Random(randomSeed), 1, 8);
+    }
+
+    private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) {
+      return TestUtil.nextInt(new Random(randomSeed), 2, 20);
+    }
+
     @Override
     protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
       // BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
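One detail worth noting in RandomlySplittingBKDWriter: because Java requires super(...) to be the first statement, the randomized constructor arguments cannot be computed and stored up front; each static helper instead re-creates a Random from the same randomSeed, so every derived choice stays deterministic for a given seed. A minimal sketch of that pattern in isolation (hypothetical classes, not Lucene code):

import java.util.Random;

// Sketch of the seed-derived-argument pattern used above.
class BaseWriter {
  final int bufferMB;
  BaseWriter(int bufferMB) {
    this.bufferMB = bufferMB;
  }
}

class RandomizedWriter extends BaseWriter {
  RandomizedWriter(int randomSeed) {
    // super(...) must be the first statement, so the argument is derived in a static helper:
    super(pickBufferMB(randomSeed));
  }

  private static int pickBufferMB(int randomSeed) {
    // A fresh Random(randomSeed) makes the choice reproducible for a given seed.
    return 1 + new Random(randomSeed).nextInt(8);
  }
}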