mirror of https://github.com/apache/lucene.git
LUCENE-7390: improve points indexing performance by letting the codec use transient heap in proportion to IndexWriter's indexing buffer, by default
This commit is contained in:
parent
4ed68bc80e
commit
3a0a9fd2c2
|
@ -135,6 +135,11 @@ Improvements
|
|||
|
||||
* LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
|
||||
|
||||
* LUCENE-7390: Improve performance of indexing points by allowing the
|
||||
codec to use transient heap in proportion to IndexWriter's RAM
|
||||
buffer, instead of a fixed 16.0 MB. A custom codec can still
|
||||
override the buffer size itself. (Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
|
||||
|
|
|
@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
|
||||
|
||||
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
|
||||
|
||||
|
@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
|
|||
fieldInfo.getPointDimensionCount(),
|
||||
fieldInfo.getPointNumBytes(),
|
||||
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
|
||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
|
||||
maxMBSortInHeap,
|
||||
values.size(fieldInfo.name),
|
||||
singleValuePerDoc) {
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.util.bkd.BKDWriter;
|
||||
|
||||
/** Abstract API to write points
|
||||
*
|
||||
|
@ -34,8 +35,9 @@ public abstract class PointsWriter implements Closeable {
|
|||
protected PointsWriter() {
|
||||
}
|
||||
|
||||
/** Write all values contained in the provided reader */
|
||||
public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
|
||||
/** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
|
||||
* transient heap that can be used to sort values, before spilling to disk for offline sorting */
|
||||
public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
|
||||
|
||||
/** Default naive merge implementation for one field: it just re-indexes all the values
|
||||
* from the incoming segment. The default codec overrides this for 1D fields and uses
|
||||
|
@ -145,7 +147,10 @@ public abstract class PointsWriter implements Closeable {
|
|||
public int getDocCount(String fieldName) {
|
||||
return finalDocCount;
|
||||
}
|
||||
});
|
||||
},
|
||||
// TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
|
||||
// and don't need heap)
|
||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
|
||||
}
|
||||
|
||||
/** Default merge implementation to merge incoming points readers by visiting all their points and
|
||||
|
|
|
@ -39,7 +39,9 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.bkd.BKDReader;
|
||||
import org.apache.lucene.util.bkd.BKDWriter;
|
||||
|
||||
/** Writes dimensional values */
|
||||
/** Writes dimensional values
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public class Lucene60PointsWriter extends PointsWriter implements Closeable {
|
||||
|
||||
/** Output used to write the BKD tree data file */
|
||||
|
@ -50,15 +52,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
|
|||
|
||||
final SegmentWriteState writeState;
|
||||
final int maxPointsInLeafNode;
|
||||
final double maxMBSortInHeap;
|
||||
private boolean finished;
|
||||
|
||||
/** Full constructor */
|
||||
public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
|
||||
public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
|
||||
assert writeState.fieldInfos.hasPointValues();
|
||||
this.writeState = writeState;
|
||||
this.maxPointsInLeafNode = maxPointsInLeafNode;
|
||||
this.maxMBSortInHeap = maxMBSortInHeap;
|
||||
String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
|
||||
writeState.segmentSuffix,
|
||||
Lucene60PointsFormat.DATA_EXTENSION);
|
||||
|
@ -80,11 +80,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
|
|||
|
||||
/** Uses the default value for {@code maxPointsInLeafNode} (1024); {@code maxMBSortInHeap} is no longer a constructor argument and is instead passed to {@code writeField} */
|
||||
public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
|
||||
this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
|
||||
this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
|
||||
|
||||
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
|
||||
|
||||
|
@ -173,7 +173,8 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
|
|||
fieldInfo.getPointDimensionCount(),
|
||||
fieldInfo.getPointNumBytes(),
|
||||
maxPointsInLeafNode,
|
||||
maxMBSortInHeap,
|
||||
// NOTE: not used, since BKDWriter.merge does a merge sort:
|
||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
|
||||
totMaxSize,
|
||||
singleValuePerDoc)) {
|
||||
List<BKDReader> bkdReaders = new ArrayList<>();
|
||||
|
|
|
@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
|
|||
final Allocator byteBlockAllocator;
|
||||
final IntBlockPool.Allocator intBlockAllocator;
|
||||
private final AtomicLong pendingNumDocs;
|
||||
private final LiveIndexWriterConfig indexWriterConfig;
|
||||
final LiveIndexWriterConfig indexWriterConfig;
|
||||
private final boolean enableTestPoints;
|
||||
private final IndexWriter indexWriter;
|
||||
|
||||
|
|
|
@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
|||
* {@link #getConfig()}.
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE:</b> after ths writer is created, the given configuration instance
|
||||
* <b>NOTE:</b> after this writer is created, the given configuration instance
|
||||
* cannot be passed to another writer.
|
||||
*
|
||||
* @param d
|
||||
|
|
|
@ -168,9 +168,14 @@ public class LiveIndexWriterConfig {
|
|||
|
||||
/**
|
||||
* Determines the amount of RAM that may be used for buffering added documents
|
||||
* and deletions before they are flushed to the Directory. Generally for
|
||||
* faster indexing performance it's best to flush by RAM usage instead of
|
||||
* document count and use as large a RAM buffer as you can.
|
||||
* and deletions before beginning to flush them to the Directory. For
|
||||
* faster indexing performance it's best to use as large a RAM buffer as you can.
|
||||
* <p>
|
||||
* Note that this setting is not a hard limit on memory usage during indexing, as
|
||||
* transient and non-trivial memory well beyond this buffer size may be used,
|
||||
* for example due to segment merges or writing points to new segments.
|
||||
* For application stability the available memory in the JVM
|
||||
* should be significantly larger than the RAM buffer used for indexing.
|
||||
* <p>
|
||||
* When this is set, the writer will flush whenever buffered documents and
|
||||
* deletions use this much RAM. Pass in
|
||||
|
@ -178,14 +183,6 @@ public class LiveIndexWriterConfig {
|
|||
* due to RAM usage. Note that if flushing by document count is also enabled,
|
||||
* then the flush will be triggered by whichever comes first.
|
||||
* <p>
|
||||
* The maximum RAM limit is inherently determined by the JVMs available
|
||||
* memory. Yet, an {@link IndexWriter} session can consume a significantly
|
||||
* larger amount of memory than the given RAM limit since this limit is just
|
||||
* an indicator when to flush memory resident documents to the Directory.
|
||||
* Flushes are likely happen concurrently while other threads adding documents
|
||||
* to the writer. For application stability the available memory in the JVM
|
||||
* should be significantly larger than the RAM buffer used for indexing.
|
||||
* <p>
|
||||
* <b>NOTE</b>: the accounting of RAM usage for pending deletions is only
|
||||
* approximate. Specifically, if you delete by Query, Lucene currently has no
|
||||
* way to measure the RAM usage of individual Queries so the accounting will
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Counter;
|
||||
import org.apache.lucene.util.bkd.BKDWriter;
|
||||
|
||||
/** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
|
||||
class PointValuesWriter {
|
||||
|
@ -35,6 +36,7 @@ class PointValuesWriter {
|
|||
private int numDocs;
|
||||
private int lastDocID = -1;
|
||||
private final byte[] packedValue;
|
||||
private final LiveIndexWriterConfig indexWriterConfig;
|
||||
|
||||
public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
|
@ -43,6 +45,7 @@ class PointValuesWriter {
|
|||
docIDs = new int[16];
|
||||
iwBytesUsed.addAndGet(16 * Integer.BYTES);
|
||||
packedValue = new byte[fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()];
|
||||
indexWriterConfig = docWriter.indexWriterConfig;
|
||||
}
|
||||
|
||||
// TODO: if exactly the same value is added to exactly the same doc, should we dedup?
|
||||
|
@ -124,6 +127,7 @@ class PointValuesWriter {
|
|||
public int getDocCount(String fieldName) {
|
||||
return numDocs;
|
||||
}
|
||||
});
|
||||
},
|
||||
Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -204,7 +204,7 @@ public class BKDWriter implements Closeable {
|
|||
// all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
|
||||
// what that level would consume, so we multiply by 0.5 to convert from
|
||||
// bytes to points here. Each dimension has its own sorted partition, so
|
||||
// we must divide by numDims as wel.
|
||||
// we must divide by numDims as well.
|
||||
|
||||
maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
|
||||
|
||||
|
|
|
@ -41,9 +41,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
|||
if (random().nextBoolean()) {
|
||||
// randomize parameters
|
||||
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
|
||||
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
// sneaky impersonation!
|
||||
|
@ -53,7 +52,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
|||
return new PointsFormat() {
|
||||
@Override
|
||||
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1156,9 +1156,8 @@ public class TestPointQueries extends LuceneTestCase {
|
|||
private static Codec getCodec() {
|
||||
if (Codec.getDefault().getName().equals("Lucene62")) {
|
||||
int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
|
||||
double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
return new FilterCodec("Lucene62", Codec.getDefault()) {
|
||||
|
@ -1167,7 +1166,7 @@ public class TestPointQueries extends LuceneTestCase {
|
|||
return new PointsFormat() {
|
||||
@Override
|
||||
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -87,9 +87,8 @@ public class TestGeo3DPoint extends LuceneTestCase {
|
|||
private static Codec getCodec() {
|
||||
if (Codec.getDefault().getName().equals("Lucene62")) {
|
||||
int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
|
||||
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
return new FilterCodec("Lucene62", Codec.getDefault()) {
|
||||
|
@ -98,7 +97,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
|
|||
return new PointsFormat() {
|
||||
@Override
|
||||
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -254,11 +254,11 @@ public final class AssertingPointsFormat extends PointsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
|
||||
if (fieldInfo.getPointDimensionCount() == 0) {
|
||||
throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
|
||||
}
|
||||
in.writeField(fieldInfo, values);
|
||||
in.writeField(fieldInfo, values, maxMBSortInHeap);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
|
||||
if (random.nextInt(100) == 0) {
|
||||
throw new IOException("Fake IOException");
|
||||
}
|
||||
delegate.writeField(fieldInfo, values);
|
||||
delegate.writeField(fieldInfo, values, maxMBSortInHeap);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -67,7 +67,6 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.bkd.BKDWriter;
|
||||
|
||||
/**
|
||||
* Abstract class to do basic tests for a geospatial impl (high level
|
||||
|
@ -1248,7 +1247,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
|
|||
return new PointsFormat() {
|
||||
@Override
|
||||
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
|
||||
return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
|
||||
return new Lucene60PointsWriter(writeState, pointsInLeaf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -92,7 +92,6 @@ public class RandomCodec extends AssertingCodec {
|
|||
// which is less effective for testing.
|
||||
// TODO: improve how we randomize this...
|
||||
private final int maxPointsInLeafNode;
|
||||
private final double maxMBSortInHeap;
|
||||
private final int bkdSplitRandomSeed;
|
||||
|
||||
@Override
|
||||
|
@ -103,9 +102,9 @@ public class RandomCodec extends AssertingCodec {
|
|||
|
||||
// Randomize how BKDWriter chooses its splits:
|
||||
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
|
||||
return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
|
||||
@Override
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
|
||||
public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
|
||||
|
||||
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
|
||||
|
||||
|
@ -185,7 +184,6 @@ public class RandomCodec extends AssertingCodec {
|
|||
int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
|
||||
|
||||
maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
|
||||
maxMBSortInHeap = 5.0 + (3*random.nextDouble());
|
||||
bkdSplitRandomSeed = random.nextInt();
|
||||
|
||||
add(avoidCodecs,
|
||||
|
@ -253,8 +251,7 @@ public class RandomCodec extends AssertingCodec {
|
|||
public String toString() {
|
||||
return super.toString() + ": " + previousMappings.toString() +
|
||||
", docValues:" + previousDVMappings.toString() +
|
||||
", maxPointsInLeafNode=" + maxPointsInLeafNode +
|
||||
", maxMBSortInHeap=" + maxMBSortInHeap;
|
||||
", maxPointsInLeafNode=" + maxPointsInLeafNode;
|
||||
}
|
||||
|
||||
/** Just like {@link BKDWriter} except it evilly picks random ways to split cells on
|
||||
|
|
Loading…
Reference in New Issue