fix int overflow bug in BKDWriter that prevented it from indexing > 2.1B points; try to improve runtime of Test2BPoints

This commit is contained in:
Mike McCandless 2016-03-11 06:48:30 -05:00
parent fafbb2b6c7
commit 1e05d3be76
2 changed files with 50 additions and 16 deletions

View File

@ -1082,7 +1082,7 @@ public class BKDWriter implements Closeable {
// Second pass: write the full values: // Second pass: write the full values:
byte[] lastPackedValue = new byte[bytesPerDim]; byte[] lastPackedValue = new byte[bytesPerDim];
for (int i=0;i<source.count;i++) { for (int i=0;i<count;i++) {
// TODO: we could do bulk copying here, avoiding the intermediate copy: // TODO: we could do bulk copying here, avoiding the intermediate copy:
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue); heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchPackedValue); assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchPackedValue);
@ -1143,7 +1143,7 @@ public class BKDWriter implements Closeable {
// Partition this source according to how the splitDim split the values: // Partition this source according to how the splitDim split the values:
int nextRightCount = 0; int nextRightCount = 0;
for (int i=0;i<source.count;i++) { for (long i=0;i<source.count;i++) {
boolean result = reader.next(); boolean result = reader.next();
assert result; assert result;
byte[] packedValue = reader.packedValue(); byte[] packedValue = reader.packedValue();

View File

@ -16,8 +16,16 @@
*/ */
package org.apache.lucene.index; package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.codecs.lucene60.Lucene60PointsReader;
import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
@ -33,10 +41,10 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
// e.g. run like this: ant test -Dtestcase=Test2BPoints -Dtests.nightly=true -Dtests.verbose=true -Dtests.monster=true // e.g. run like this: ant test -Dtestcase=Test2BPoints -Dtests.nightly=true -Dtests.verbose=true -Dtests.monster=true
// //
// or: python -u /l/util/src/python/repeatLuceneTest.py -once -nolog -tmpDir /b/tmp -logDir /l/logs Test2BPoints.test1D -verbose // or: python -u /l/util/src/python/repeatLuceneTest.py -heap 6g -once -nolog -tmpDir /b/tmp -logDir /l/logs Test2BPoints.test2D -verbose
@SuppressCodecs({ "SimpleText", "Memory", "Direct", "Compressing" }) @SuppressCodecs({ "SimpleText", "Memory", "Direct", "Compressing" })
@TimeoutSuite(millis = 16 * TimeUnits.HOUR) @TimeoutSuite(millis = 365 * 24 * TimeUnits.HOUR) // hopefully ~1 year is long enough ;)
@Monster("takes at least 4 hours and consumes many GB of temp disk space") @Monster("takes at least 4 hours and consumes many GB of temp disk space")
public class Test2BPoints extends LuceneTestCase { public class Test2BPoints extends LuceneTestCase {
public void test1D() throws Exception { public void test1D() throws Exception {
@ -44,13 +52,15 @@ public class Test2BPoints extends LuceneTestCase {
System.out.println("DIR: " + ((FSDirectory) dir).getDirectory()); System.out.println("DIR: " + ((FSDirectory) dir).getDirectory());
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())) IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()))
.setCodec(Codec.forName("Lucene60")) .setCodec(getCodec())
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
.setRAMBufferSizeMB(64.0) .setRAMBufferSizeMB(256.0)
.setMergeScheduler(new ConcurrentMergeScheduler()) .setMergeScheduler(new ConcurrentMergeScheduler())
.setMergePolicy(newLogMergePolicy(false, 10)) .setMergePolicy(newLogMergePolicy(false, 10))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE); .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(6, 3);
IndexWriter w = new IndexWriter(dir, iwc); IndexWriter w = new IndexWriter(dir, iwc);
MergePolicy mp = w.getConfig().getMergePolicy(); MergePolicy mp = w.getConfig().getMergePolicy();
@ -88,13 +98,15 @@ public class Test2BPoints extends LuceneTestCase {
Directory dir = FSDirectory.open(createTempDir("2BPoints2D")); Directory dir = FSDirectory.open(createTempDir("2BPoints2D"));
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())) IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()))
.setCodec(Codec.forName("Lucene60")) .setCodec(getCodec())
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
.setRAMBufferSizeMB(64.0) .setRAMBufferSizeMB(256.0)
.setMergeScheduler(new ConcurrentMergeScheduler()) .setMergeScheduler(new ConcurrentMergeScheduler())
.setMergePolicy(newLogMergePolicy(false, 10)) .setMergePolicy(newLogMergePolicy(false, 10))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE); .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(6, 3);
IndexWriter w = new IndexWriter(dir, iwc); IndexWriter w = new IndexWriter(dir, iwc);
MergePolicy mp = w.getConfig().getMergePolicy(); MergePolicy mp = w.getConfig().getMergePolicy();
@ -127,4 +139,26 @@ public class Test2BPoints extends LuceneTestCase {
TestUtil.checkIndex(dir); TestUtil.checkIndex(dir);
dir.close(); dir.close();
} }
private static Codec getCodec() {
return new FilterCodec("Lucene60", Codec.forName("Lucene60")) {
@Override
public PointsFormat pointsFormat() {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
int maxPointsInLeafNode = 1024;
double maxMBSortInHeap = 256.0;
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
}
@Override
public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
return new Lucene60PointsReader(readState);
}
};
}
};
}
} }