LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes long ords even for small number of points

This commit is contained in:
Mike McCandless 2016-03-27 05:52:00 -04:00
parent 9f9c2f74b3
commit 28a4335813
5 changed files with 104 additions and 15 deletions

View File

@ -128,21 +128,31 @@ public class BKDWriter implements Closeable {
protected long pointCount;
/** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
private final boolean longOrds;
protected final boolean longOrds;
/** An upper bound on how many points the caller will add (includes deletions) */
private final long totalPointCount;
/** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
private final boolean singleValuePerDoc;
protected final boolean singleValuePerDoc;
/** How much heap OfflineSorter is allowed to use */
protected final OfflineSorter.BufferSize offlineSorterBufferMB;
/** How much heap OfflineSorter is allowed to use */
protected final int offlineSorterMaxTempFiles;
private final int maxDoc;
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException {
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc);
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
}
public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException {
verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
// We use tracking dir to deal with removing files on exception, so each place that
// creates temp files doesn't need crazy try/finally/sucess logic:
@ -153,6 +163,8 @@ public class BKDWriter implements Closeable {
this.bytesPerDim = bytesPerDim;
this.totalPointCount = totalPointCount;
this.maxDoc = maxDoc;
this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
docsSeen = new FixedBitSet(maxDoc);
packedBytesLength = numDims * bytesPerDim;
@ -166,7 +178,8 @@ public class BKDWriter implements Closeable {
maxPackedValue = new byte[packedBytesLength];
// If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
longOrds = totalPointCount > Integer.MAX_VALUE;
this.longOrds = longOrds;
this.singleValuePerDoc = singleValuePerDoc;
// dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
@ -735,7 +748,7 @@ public class BKDWriter implements Closeable {
// TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
IndexOutput[] lastWriter = new IndexOutput[1];
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) {
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
@Override

View File

@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader {
in.readBytes(buffer, 0, buffer.length);
long ord;
if (singleValuePerDoc == false) {
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
} else if (longOrds) {
if (longOrds) {
// A long ord, after the docID:
ord = readLong(buffer, packedBytesLength+Integer.BYTES);
} else {
} else if (singleValuePerDoc) {
// docID is the ord:
ord = readInt(buffer, packedBytesLength);
} else {
// An int ord, after the docID:
ord = readInt(buffer, packedBytesLength+Integer.BYTES);
}
if (rightTree.get(ord)) {

View File

@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false);
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES,
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
int counter = 0;
byte[] packedBytes = new byte[Long.BYTES];
for (int docID = 0; docID < numDocs; docID++) {
@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
final int numDocs = (Integer.MAX_VALUE / 26) + 100;
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false);
BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES,
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
int counter = 0;
byte[] packedBytes = new byte[2*Long.BYTES];
for (int docID = 0; docID < numDocs; docID++) {

View File

@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase {
public void testTieBreakOrder() throws Exception {
try (Directory dir = newDirectory()) {
int numDocs = 10000;
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true);
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true);
for(int i=0;i<numDocs;i++) {
w.add(new byte[Integer.BYTES], i);
}
@ -925,4 +925,53 @@ public class TestBKD extends LuceneTestCase {
in.close();
}
}
public void test2DLongOrdsOffline() throws Exception {
try (Directory dir = newDirectory()) {
int numDocs = 100000;
boolean singleValuePerDoc = false;
boolean longOrds = true;
int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20);
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs,
singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles);
byte[] buffer = new byte[2*Integer.BYTES];
for(int i=0;i<numDocs;i++) {
random().nextBytes(buffer);
w.add(buffer, i);
}
IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
long fp = w.finish(out);
out.close();
IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
in.seek(fp);
BKDReader r = new BKDReader(in);
int[] count = new int[1];
r.intersect(new IntersectVisitor() {
@Override
public void visit(int docID) {
count[0]++;
}
@Override
public void visit(int docID, byte[] packedValue) {
visit(docID);
}
@Override
public Relation compare(byte[] minPacked, byte[] maxPacked) {
if (random().nextInt(7) == 1) {
return Relation.CELL_CROSSES_QUERY;
} else {
return Relation.CELL_INSIDE_QUERY;
}
}
});
assertEquals(numDocs, count[0]);
in.close();
}
}
}

View File

@ -277,10 +277,32 @@ public class RandomCodec extends AssertingCodec {
public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc);
super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount,
getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed),
getRandomLongOrds(totalPointCount, singleValuePerDoc, randomSeed),
getRandomOfflineSorterBufferMB(randomSeed),
getRandomOfflineSorterMaxTempFiles(randomSeed));
this.random = new Random(randomSeed);
}
private static boolean getRandomSingleValuePerDoc(boolean singleValuePerDoc, int randomSeed) {
// If we are single valued, sometimes pretend we aren't:
return singleValuePerDoc && (new Random(randomSeed).nextBoolean());
}
private static boolean getRandomLongOrds(long totalPointCount, boolean singleValuePerDoc, int randomSeed) {
// Always use long ords if we have too many points, but sometimes randomly use it anyway when singleValuePerDoc is false:
return totalPointCount > Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed) == false && new Random(randomSeed).nextBoolean());
}
private static long getRandomOfflineSorterBufferMB(int randomSeed) {
return TestUtil.nextInt(new Random(randomSeed), 1, 8);
}
private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) {
return TestUtil.nextInt(new Random(randomSeed), 2, 20);
}
@Override
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
// BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)