LUCENE-7390: improve points indexing performance by letting the codec use transient heap in proportion to IndexWriter's indexing buffer, by default

This commit is contained in:
Mike McCandless 2016-07-25 11:33:34 -04:00
parent 4ed68bc80e
commit 3a0a9fd2c2
16 changed files with 53 additions and 48 deletions

View File

@ -135,6 +135,11 @@ Improvements
* LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
* LUCENE-7390: Improve performance of indexing points by allowing the
codec to use transient heap in proportion to IndexWriter's RAM
buffer, instead of a fixed 16.0 MB. A custom codec can still
override the buffer size itself. (Mike McCandless)
Optimizations
* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)

View File

@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
}
@Override
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointNumBytes(),
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
maxMBSortInHeap,
values.size(fieldInfo.name),
singleValuePerDoc) {

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.util.bkd.BKDWriter;
/** Abstract API to write points
*
@ -34,8 +35,9 @@ public abstract class PointsWriter implements Closeable {
protected PointsWriter() {
}
/** Write all values contained in the provided reader */
public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
/** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
* transient heap that can be used to sort values, before spilling to disk for offline sorting */
public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
/** Default naive merge implementation for one field: it just re-indexes all the values
* from the incoming segment. The default codec overrides this for 1D fields and uses
@ -145,7 +147,10 @@ public abstract class PointsWriter implements Closeable {
public int getDocCount(String fieldName) {
return finalDocCount;
}
});
},
// TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
// and don't need heap)
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
}
/** Default merge implementation to merge incoming points readers by visiting all their points and

View File

@ -39,7 +39,9 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
/** Writes dimensional values */
/** Writes dimensional values
*
* @lucene.experimental */
public class Lucene60PointsWriter extends PointsWriter implements Closeable {
/** Output used to write the BKD tree data file */
@ -50,15 +52,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
final SegmentWriteState writeState;
final int maxPointsInLeafNode;
final double maxMBSortInHeap;
private boolean finished;
/** Full constructor */
public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
assert writeState.fieldInfos.hasPointValues();
this.writeState = writeState;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.maxMBSortInHeap = maxMBSortInHeap;
String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
writeState.segmentSuffix,
Lucene60PointsFormat.DATA_EXTENSION);
@ -80,11 +80,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
/** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
}
@Override
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@ -173,7 +173,8 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointNumBytes(),
maxPointsInLeafNode,
maxMBSortInHeap,
// NOTE: not used, since BKDWriter.merge does a merge sort:
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
totMaxSize,
singleValuePerDoc)) {
List<BKDReader> bkdReaders = new ArrayList<>();

View File

@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
final Allocator byteBlockAllocator;
final IntBlockPool.Allocator intBlockAllocator;
private final AtomicLong pendingNumDocs;
private final LiveIndexWriterConfig indexWriterConfig;
final LiveIndexWriterConfig indexWriterConfig;
private final boolean enableTestPoints;
private final IndexWriter indexWriter;

View File

@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
* {@link #getConfig()}.
*
* <p>
* <b>NOTE:</b> after ths writer is created, the given configuration instance
* <b>NOTE:</b> after this writer is created, the given configuration instance
* cannot be passed to another writer.
*
* @param d

View File

@ -168,9 +168,14 @@ public class LiveIndexWriterConfig {
/**
* Determines the amount of RAM that may be used for buffering added documents
* and deletions before they are flushed to the Directory. Generally for
* faster indexing performance it's best to flush by RAM usage instead of
* document count and use as large a RAM buffer as you can.
* and deletions before beginning to flush them to the Directory. For
* faster indexing performance it's best to use as large a RAM buffer as you can.
* <p>
* Note that this setting is not a hard limit on memory usage during indexing, as
* transient and non-trivial memory well beyond this buffer size may be used,
* for example due to segment merges or writing points to new segments.
* For application stability the available memory in the JVM
* should be significantly larger than the RAM buffer used for indexing.
* <p>
* When this is set, the writer will flush whenever buffered documents and
* deletions use this much RAM. Pass in
@ -178,14 +183,6 @@ public class LiveIndexWriterConfig {
* due to RAM usage. Note that if flushing by document count is also enabled,
* then the flush will be triggered by whichever comes first.
* <p>
* The maximum RAM limit is inherently determined by the JVMs available
* memory. Yet, an {@link IndexWriter} session can consume a significantly
* larger amount of memory than the given RAM limit since this limit is just
* an indicator when to flush memory resident documents to the Directory.
* Flushes are likely happen concurrently while other threads adding documents
* to the writer. For application stability the available memory in the JVM
* should be significantly larger than the RAM buffer used for indexing.
* <p>
* <b>NOTE</b>: the account of RAM usage for pending deletions is only
* approximate. Specifically, if you delete by Query, Lucene currently has no
* way to measure the RAM usage of individual Queries so the accounting will

View File

@ -24,6 +24,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.bkd.BKDWriter;
/** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
class PointValuesWriter {
@ -35,6 +36,7 @@ class PointValuesWriter {
private int numDocs;
private int lastDocID = -1;
private final byte[] packedValue;
private final LiveIndexWriterConfig indexWriterConfig;
public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
@ -43,6 +45,7 @@ class PointValuesWriter {
docIDs = new int[16];
iwBytesUsed.addAndGet(16 * Integer.BYTES);
packedValue = new byte[fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()];
indexWriterConfig = docWriter.indexWriterConfig;
}
// TODO: if exactly the same value is added to exactly the same doc, should we dedup?
@ -124,6 +127,7 @@ class PointValuesWriter {
public int getDocCount(String fieldName) {
return numDocs;
}
});
},
Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
}
}

View File

@ -204,7 +204,7 @@ public class BKDWriter implements Closeable {
// all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
// what that level would consume, so we multiply by 0.5 to convert from
// bytes to points here. Each dimension has its own sorted partition, so
// we must divide by numDims as wel.
// we must divide by numDims as well.
maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));

View File

@ -41,9 +41,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
if (random().nextBoolean()) {
// randomize parameters
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
if (VERBOSE) {
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
}
// sneaky impersonation!
@ -53,7 +52,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
}
@Override

View File

@ -1156,9 +1156,8 @@ public class TestPointQueries extends LuceneTestCase {
private static Codec getCodec() {
if (Codec.getDefault().getName().equals("Lucene62")) {
int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
if (VERBOSE) {
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
}
return new FilterCodec("Lucene62", Codec.getDefault()) {
@ -1167,7 +1166,7 @@ public class TestPointQueries extends LuceneTestCase {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
}
@Override

View File

@ -87,9 +87,8 @@ public class TestGeo3DPoint extends LuceneTestCase {
private static Codec getCodec() {
if (Codec.getDefault().getName().equals("Lucene62")) {
int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
if (VERBOSE) {
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
}
return new FilterCodec("Lucene62", Codec.getDefault()) {
@ -98,7 +97,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
}
@Override

View File

@ -254,11 +254,11 @@ public final class AssertingPointsFormat extends PointsFormat {
}
@Override
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
if (fieldInfo.getPointDimensionCount() == 0) {
throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
}
in.writeField(fieldInfo, values);
in.writeField(fieldInfo, values, maxMBSortInHeap);
}
@Override

View File

@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
}
@Override
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
if (random.nextInt(100) == 0) {
throw new IOException("Fake IOException");
}
delegate.writeField(fieldInfo, values);
delegate.writeField(fieldInfo, values, maxMBSortInHeap);
}
@Override

View File

@ -67,7 +67,6 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.SloppyMath;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.bkd.BKDWriter;
/**
* Abstract class to do basic tests for a geospatial impl (high level
@ -1248,7 +1247,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
return new Lucene60PointsWriter(writeState, pointsInLeaf);
}
@Override

View File

@ -92,7 +92,6 @@ public class RandomCodec extends AssertingCodec {
// which is less effective for testing.
// TODO: improve how we randomize this...
private final int maxPointsInLeafNode;
private final double maxMBSortInHeap;
private final int bkdSplitRandomSeed;
@Override
@ -103,9 +102,9 @@ public class RandomCodec extends AssertingCodec {
// Randomize how BKDWriter chooses its splis:
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
@Override
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@ -185,7 +184,6 @@ public class RandomCodec extends AssertingCodec {
int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
maxMBSortInHeap = 5.0 + (3*random.nextDouble());
bkdSplitRandomSeed = random.nextInt();
add(avoidCodecs,
@ -253,8 +251,7 @@ public class RandomCodec extends AssertingCodec {
public String toString() {
return super.toString() + ": " + previousMappings.toString() +
", docValues:" + previousDVMappings.toString() +
", maxPointsInLeafNode=" + maxPointsInLeafNode +
", maxMBSortInHeap=" + maxMBSortInHeap;
", maxPointsInLeafNode=" + maxPointsInLeafNode;
}
/** Just like {@link BKDWriter} except it evilly picks random ways to split cells on