From 7b93fc46b1e284494a500f4df541269bdfa67dcf Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 27 Mar 2016 05:52:00 -0400 Subject: [PATCH] LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes long ords even for small number of points --- .../org/apache/lucene/util/bkd/BKDWriter.java | 27 +++++++--- .../lucene/util/bkd/OfflinePointReader.java | 11 ++-- .../lucene/util/bkd/Test2BBKDPoints.java | 6 ++- .../org/apache/lucene/util/bkd/TestBKD.java | 51 ++++++++++++++++++- .../org/apache/lucene/index/RandomCodec.java | 24 ++++++++- 5 files changed, 104 insertions(+), 15 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 33f72e4e9e1..3cf32a9b0d3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -128,21 +128,31 @@ public class BKDWriter implements Closeable { protected long pointCount; /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */ - private final boolean longOrds; + protected final boolean longOrds; /** An upper bound on how many points the caller will add (includes deletions) */ private final long totalPointCount; /** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */ - private final boolean singleValuePerDoc; + protected final boolean singleValuePerDoc; + + /** How much heap OfflineSorter is allowed to use */ + protected final OfflineSorter.BufferSize offlineSorterBufferMB; + + /** How much heap OfflineSorter is allowed to use */ + protected final int offlineSorterMaxTempFiles; private final int maxDoc; - public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException { - this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc); + public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException { + this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc, + totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES); } - public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException { + protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, + int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, + boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException { verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount); // We use tracking dir to deal with removing files on exception, so each place that // creates temp files doesn't need crazy try/finally/sucess logic: @@ -153,6 +163,8 @@ public class BKDWriter implements Closeable { this.bytesPerDim = bytesPerDim; this.totalPointCount = totalPointCount; this.maxDoc = maxDoc; + this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB); + this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles; docsSeen = new FixedBitSet(maxDoc); packedBytesLength = numDims * bytesPerDim; @@ -166,7 +178,8 @@ public class BKDWriter implements Closeable { maxPackedValue = new byte[packedBytesLength]; // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes). - longOrds = totalPointCount > Integer.MAX_VALUE; + this.longOrds = longOrds; + this.singleValuePerDoc = singleValuePerDoc; // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int) @@ -735,7 +748,7 @@ public class BKDWriter implements Closeable { // TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter: IndexOutput[] lastWriter = new IndexOutput[1]; - OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) { + OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) { /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */ @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java index 20faa9e361d..17758c02c93 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java @@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader { in.readBytes(buffer, 0, buffer.length); long ord; - if (singleValuePerDoc == false) { - ord = readInt(buffer, packedBytesLength+Integer.BYTES); - } else if (longOrds) { + if (longOrds) { + // A long ord, after the docID: ord = readLong(buffer, packedBytesLength+Integer.BYTES); - } else { + } else if (singleValuePerDoc) { + // docID is the ord: ord = readInt(buffer, packedBytesLength); + } else { + // An int ord, after the docID: + ord = readInt(buffer, packedBytesLength+Integer.BYTES); } if (rightTree.get(ord)) { diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java index ffcbcf83cc5..af2e463102c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java @@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase { final int numDocs = (Integer.MAX_VALUE / 26) + 100; - BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false); + BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, + BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false); int counter = 0; byte[] packedBytes = new byte[Long.BYTES]; for (int docID = 0; docID < numDocs; docID++) { @@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase { final int numDocs = (Integer.MAX_VALUE / 26) + 100; - BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false); + BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, + BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false); int counter = 0; byte[] packedBytes = new byte[2*Long.BYTES]; for (int docID = 0; docID < numDocs; docID++) { diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index 18fbacf3890..38b3fb5c87c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase { public void testTieBreakOrder() throws Exception { try (Directory dir = newDirectory()) { int numDocs = 10000; - BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true); + BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true); for(int i=0;i Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed) == false && new Random(randomSeed).nextBoolean()); + } + + private static long getRandomOfflineSorterBufferMB(int randomSeed) { + return TestUtil.nextInt(new Random(randomSeed), 1, 8); + } + + private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) { + return TestUtil.nextInt(new Random(randomSeed), 2, 20); + } + @Override protected int split(byte[] minPackedValue, byte[] maxPackedValue) { // BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)