LUCENE-7390: improve points indexing performance by letting the codec use transient heap in proportion to IndexWriter's indexing buffer, by default

Mike McCandless 2016-07-25 11:33:34 -04:00
parent 4ed68bc80e
commit 3a0a9fd2c2
16 changed files with 53 additions and 48 deletions

View File

@@ -135,6 +135,11 @@ Improvements
 * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
+* LUCENE-7390: Improve performance of indexing points by allowing the
+  codec to use transient heap in proportion to IndexWriter's RAM
+  buffer, instead of a fixed 16.0 MB. A custom codec can still
+  override the buffer size itself. (Mike McCandless)
 Optimizations
 * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)

View File

@@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
       fieldInfo.getPointDimensionCount(),
       fieldInfo.getPointNumBytes(),
       BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-      BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+      maxMBSortInHeap,
       values.size(fieldInfo.name),
       singleValuePerDoc) {

View File

@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
+import org.apache.lucene.util.bkd.BKDWriter;
 /** Abstract API to write points
  *
@@ -34,8 +35,9 @@ public abstract class PointsWriter implements Closeable {
   protected PointsWriter() {
   }
-  /** Write all values contained in the provided reader */
-  public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
+  /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
+   *  transient heap that can be used to sort values, before spilling to disk for offline sorting */
+  public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
   /** Default naive merge implementation for one field: it just re-indexes all the values
    *  from the incoming segment. The default codec overrides this for 1D fields and uses
@@ -145,7 +147,10 @@ public abstract class PointsWriter implements Closeable {
       public int getDocCount(String fieldName) {
         return finalDocCount;
       }
-    });
+    },
+    // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
+    // and don't need heap)
+    BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
   }
   /** Default merge implementation to merge incoming points readers by visiting all their points and

View File

@@ -39,7 +39,9 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.bkd.BKDReader;
 import org.apache.lucene.util.bkd.BKDWriter;
-/** Writes dimensional values */
+/** Writes dimensional values
+ *
+ * @lucene.experimental */
 public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   /** Output used to write the BKD tree data file */
@@ -50,15 +52,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   final SegmentWriteState writeState;
   final int maxPointsInLeafNode;
-  final double maxMBSortInHeap;
   private boolean finished;
   /** Full constructor */
-  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
+  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
     assert writeState.fieldInfos.hasPointValues();
     this.writeState = writeState;
     this.maxPointsInLeafNode = maxPointsInLeafNode;
-    this.maxMBSortInHeap = maxMBSortInHeap;
     String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
                                                          writeState.segmentSuffix,
                                                          Lucene60PointsFormat.DATA_EXTENSION);
@@ -80,11 +80,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
   public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
-    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -173,7 +173,8 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
       fieldInfo.getPointDimensionCount(),
       fieldInfo.getPointNumBytes(),
       maxPointsInLeafNode,
-      maxMBSortInHeap,
+      // NOTE: not used, since BKDWriter.merge does a merge sort:
+      BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
       totMaxSize,
       singleValuePerDoc)) {
     List<BKDReader> bkdReaders = new ArrayList<>();
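The CHANGES entry above notes that a custom codec can still control the sort buffer itself. One way to do that under the new writeField signature, sketched here under the assumption of the Lucene 6.x package layout (the class name FixedHeapPointsWriter is hypothetical and not part of this commit), is to subclass Lucene60PointsWriter and ignore the budget IndexWriter passes down:

    import java.io.IOException;

    import org.apache.lucene.codecs.PointsReader;
    import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.index.SegmentWriteState;
    import org.apache.lucene.util.bkd.BKDWriter;

    // Sketch only: keep the old fixed 16.0 MB in-heap sort budget regardless of
    // how large IndexWriter's RAM buffer is. Not part of this commit.
    class FixedHeapPointsWriter extends Lucene60PointsWriter {

      FixedHeapPointsWriter(SegmentWriteState writeState) throws IOException {
        super(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
      }

      @Override
      public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
        // Ignore the budget passed down from IndexWriter and keep the previous default:
        super.writeField(fieldInfo, values, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
      }
    }

Such a writer would still have to be returned from a custom PointsFormat, the same way the test codecs further down in this commit wrap Lucene60PointsWriter.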

View File

@@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
   final Allocator byteBlockAllocator;
   final IntBlockPool.Allocator intBlockAllocator;
   private final AtomicLong pendingNumDocs;
-  private final LiveIndexWriterConfig indexWriterConfig;
+  final LiveIndexWriterConfig indexWriterConfig;
   private final boolean enableTestPoints;
   private final IndexWriter indexWriter;

View File

@@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    * {@link #getConfig()}.
    *
    * <p>
-   * <b>NOTE:</b> after ths writer is created, the given configuration instance
+   * <b>NOTE:</b> after this writer is created, the given configuration instance
    * cannot be passed to another writer.
    *
    * @param d

View File

@@ -168,9 +168,14 @@ public class LiveIndexWriterConfig {
   /**
    * Determines the amount of RAM that may be used for buffering added documents
-   * and deletions before they are flushed to the Directory. Generally for
-   * faster indexing performance it's best to flush by RAM usage instead of
-   * document count and use as large a RAM buffer as you can.
+   * and deletions before beginning to flush them to the Directory. For
+   * faster indexing performance it's best to use as large a RAM buffer as you can.
+   * <p>
+   * Note that this setting is not a hard limit on memory usage during indexing, as
+   * transient and non-trivial memory well beyond this buffer size may be used,
+   * for example due to segment merges or writing points to new segments.
+   * For application stability the available memory in the JVM
+   * should be significantly larger than the RAM buffer used for indexing.
    * <p>
    * When this is set, the writer will flush whenever buffered documents and
    * deletions use this much RAM. Pass in
@@ -178,14 +183,6 @@ public class LiveIndexWriterConfig {
    * due to RAM usage. Note that if flushing by document count is also enabled,
    * then the flush will be triggered by whichever comes first.
    * <p>
-   * The maximum RAM limit is inherently determined by the JVMs available
-   * memory. Yet, an {@link IndexWriter} session can consume a significantly
-   * larger amount of memory than the given RAM limit since this limit is just
-   * an indicator when to flush memory resident documents to the Directory.
-   * Flushes are likely happen concurrently while other threads adding documents
-   * to the writer. For application stability the available memory in the JVM
-   * should be significantly larger than the RAM buffer used for indexing.
-   * <p>
    * <b>NOTE</b>: the account of RAM usage for pending deletions is only
    * approximate. Specifically, if you delete by Query, Lucene currently has no
    * way to measure the RAM usage of individual Queries so the accounting will
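To make the revised javadoc concrete, here is a hedged usage sketch (the index path, analyzer, and the 256 MB figure are illustrative, not taken from the commit): setting a larger RAM buffer now also raises the transient heap the points codec may use while flushing, so the JVM needs headroom beyond the buffer itself.

    import java.io.IOException;
    import java.nio.file.Paths;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class RamBufferExample {
      public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("/tmp/points-index")); // illustrative path
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setRAMBufferSizeMB(256.0); // flush once buffered docs/deletes reach ~256 MB
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
          // After this commit, flushing a points field may additionally use up to
          // max(256 / 8.0, 16.0) = 32 MB of transient heap for in-heap sorting.
        }
      }
    }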

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.bkd.BKDWriter;
 /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
 class PointValuesWriter {
@@ -35,6 +36,7 @@ class PointValuesWriter {
   private int numDocs;
   private int lastDocID = -1;
   private final byte[] packedValue;
+  private final LiveIndexWriterConfig indexWriterConfig;
   public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
@@ -43,6 +45,7 @@ class PointValuesWriter {
     docIDs = new int[16];
     iwBytesUsed.addAndGet(16 * Integer.BYTES);
     packedValue = new byte[fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()];
+    indexWriterConfig = docWriter.indexWriterConfig;
   }
   // TODO: if exactly the same value is added to exactly the same doc, should we dedup?
@@ -124,6 +127,7 @@ class PointValuesWriter {
       public int getDocCount(String fieldName) {
         return numDocs;
       }
-    });
+    },
+    Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
   }
 }
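The new Math.max expression above is the heart of the change. A small sketch of the arithmetic it wires up (the helper name budgetMB is illustrative, not from the commit):

    import org.apache.lucene.util.bkd.BKDWriter;

    public class PointsSortBudget {
      /** Mirrors the expression this commit adds to PointValuesWriter. */
      static double budgetMB(double ramBufferSizeMB) {
        return Math.max(ramBufferSizeMB / 8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
      }

      public static void main(String[] args) {
        System.out.println(budgetMB(64.0));   // 16.0: small buffers keep the old 16 MB floor
        System.out.println(budgetMB(1024.0)); // 128.0: larger buffers scale the budget up
      }
    }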

View File

@@ -204,7 +204,7 @@ public class BKDWriter implements Closeable {
     // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
     // what that level would consume, so we multiply by 0.5 to convert from
     // bytes to points here. Each dimension has its own sorted partition, so
-    // we must divide by numDims as wel.
+    // we must divide by numDims as well.
     maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
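For a worked instance of the conversion on the last line (the inputs below are illustrative; the real bytesPerDoc depends on the field's packed value size and whether ords and doc IDs are stored):

    public class SortHeapMath {
      public static void main(String[] args) {
        double maxMBSortInHeap = 16.0; // the default in-heap sort budget
        int bytesPerDoc = 16;          // illustrative figure, not the exact BKDWriter value
        int numDims = 2;               // e.g. a 2D point field
        int maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
        System.out.println(maxPointsSortInHeap); // 262144 points can be sorted in heap per partition
      }
    }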

View File

@@ -41,9 +41,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     if (random().nextBoolean()) {
       // randomize parameters
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
-      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       // sneaky impersonation!
@@ -53,7 +52,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -1156,9 +1156,8 @@ public class TestPointQueries extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
-      double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -1167,7 +1166,7 @@ public class TestPointQueries extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -87,9 +87,8 @@ public class TestGeo3DPoint extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
-      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -98,7 +97,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -254,11 +254,11 @@ public final class AssertingPointsFormat extends PointsFormat {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     if (fieldInfo.getPointDimensionCount() == 0) {
       throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
     }
-    in.writeField(fieldInfo, values);
+    in.writeField(fieldInfo, values, maxMBSortInHeap);
   }
   @Override

View File

@@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     if (random.nextInt(100) == 0) {
       throw new IOException("Fake IOException");
     }
-    delegate.writeField(fieldInfo, values);
+    delegate.writeField(fieldInfo, values, maxMBSortInHeap);
   }
   @Override

View File

@@ -67,7 +67,6 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.SloppyMath;
 import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.bkd.BKDWriter;
 /**
  * Abstract class to do basic tests for a geospatial impl (high level
@@ -1248,7 +1247,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+          return new Lucene60PointsWriter(writeState, pointsInLeaf);
         }
         @Override

View File

@@ -92,7 +92,6 @@ public class RandomCodec extends AssertingCodec {
   // which is less effective for testing.
   // TODO: improve how we randomize this...
   private final int maxPointsInLeafNode;
-  private final double maxMBSortInHeap;
   private final int bkdSplitRandomSeed;
   @Override
@@ -103,9 +102,9 @@ public class RandomCodec extends AssertingCodec {
       // Randomize how BKDWriter chooses its splis:
-      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
+      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
         @Override
-        public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+        public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
           boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -185,7 +184,6 @@ public class RandomCodec extends AssertingCodec {
     int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
     maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
-    maxMBSortInHeap = 5.0 + (3*random.nextDouble());
     bkdSplitRandomSeed = random.nextInt();
     add(avoidCodecs,
@@ -253,8 +251,7 @@ public class RandomCodec extends AssertingCodec {
   public String toString() {
     return super.toString() + ": " + previousMappings.toString() +
            ", docValues:" + previousDVMappings.toString() +
-           ", maxPointsInLeafNode=" + maxPointsInLeafNode +
-           ", maxMBSortInHeap=" + maxMBSortInHeap;
+           ", maxPointsInLeafNode=" + maxPointsInLeafNode;
   }
   /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on