diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7b53017c780..917dfa28da7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -135,6 +135,11 @@ Improvements * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley) +* LUCENE-7390: Improve performance of indexing points by allowing the + codec to use transient heap in proportion to IndexWriter's RAM + buffer, instead of a fixed 16.0 MB. A custom codec can still + override the buffer size itself. (Mike McCandless) + Optimizations * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java index 8d5c0344509..67289b63c10 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java @@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter { fieldInfo.getPointDimensionCount(), fieldInfo.getPointNumBytes(), BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, - BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, + maxMBSortInHeap, values.size(fieldInfo.name), singleValuePerDoc) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java index 05084db6ca1..845849780a8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; +import org.apache.lucene.util.bkd.BKDWriter; /** Abstract API to write points * @@ -34,8 +35,9 @@ public abstract class PointsWriter implements Closeable { protected PointsWriter() { } - /** Write all values contained in the provided reader */ - public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException; + /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum + * transient heap that can be used to sort values, before spilling to disk for offline sorting */ + public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException; /** Default naive merge implementation for one field: it just re-indexes all the values * from the incoming segment. The default codec overrides this for 1D fields and uses @@ -145,7 +147,10 @@ public abstract class PointsWriter implements Closeable { public int getDocCount(String fieldName) { return finalDocCount; } - }); + }, + // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort + // and don't need heap) + BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); } /** Default merge implementation to merge incoming points readers by visiting all their points and diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java index 63308c422b3..3acfac3f8b0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java @@ -39,7 +39,9 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.bkd.BKDReader; import org.apache.lucene.util.bkd.BKDWriter; -/** Writes dimensional values */ +/** Writes dimensional values + * + * @lucene.experimental */ public class Lucene60PointsWriter extends PointsWriter implements Closeable { /** Output used to write the BKD tree data file */ @@ -50,15 +52,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { final SegmentWriteState writeState; final int maxPointsInLeafNode; - final double maxMBSortInHeap; private boolean finished; /** Full constructor */ - public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException { + public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException { assert writeState.fieldInfos.hasPointValues(); this.writeState = writeState; this.maxPointsInLeafNode = maxPointsInLeafNode; - this.maxMBSortInHeap = maxMBSortInHeap; String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name, writeState.segmentSuffix, Lucene60PointsFormat.DATA_EXTENSION); @@ -80,11 +80,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */ public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException { - this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); + this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -173,7 +173,8 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { fieldInfo.getPointDimensionCount(), fieldInfo.getPointNumBytes(), maxPointsInLeafNode, - maxMBSortInHeap, + // NOTE: not used, since BKDWriter.merge does a merge sort: + BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, totMaxSize, singleValuePerDoc)) { List bkdReaders = new ArrayList<>(); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index e72145c93f6..351235e2b5d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -153,7 +153,7 @@ class DocumentsWriterPerThread { final Allocator byteBlockAllocator; final IntBlockPool.Allocator intBlockAllocator; private final AtomicLong pendingNumDocs; - private final LiveIndexWriterConfig indexWriterConfig; + final LiveIndexWriterConfig indexWriterConfig; private final boolean enableTestPoints; private final IndexWriter indexWriter; diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 0fb23d96b36..bdd95526429 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * {@link #getConfig()}. * *

- * NOTE: after ths writer is created, the given configuration instance + * NOTE: after this writer is created, the given configuration instance * cannot be passed to another writer. * * @param d diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index cec70c099aa..fe4924df6c7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -168,9 +168,14 @@ public class LiveIndexWriterConfig { /** * Determines the amount of RAM that may be used for buffering added documents - * and deletions before they are flushed to the Directory. Generally for - * faster indexing performance it's best to flush by RAM usage instead of - * document count and use as large a RAM buffer as you can. + * and deletions before beginning to flush them to the Directory. For + * faster indexing performance it's best to use as large a RAM buffer as you can. + *

+ * Note that this setting is not a hard limit on memory usage during indexing, as + * transient and non-trivial memory well beyond this buffer size may be used, + * for example due to segment merges or writing points to new segments. + * For application stability the available memory in the JVM + * should be significantly larger than the RAM buffer used for indexing. *

* When this is set, the writer will flush whenever buffered documents and * deletions use this much RAM. Pass in @@ -178,14 +183,6 @@ public class LiveIndexWriterConfig { * due to RAM usage. Note that if flushing by document count is also enabled, * then the flush will be triggered by whichever comes first. *

- * The maximum RAM limit is inherently determined by the JVMs available - * memory. Yet, an {@link IndexWriter} session can consume a significantly - * larger amount of memory than the given RAM limit since this limit is just - * an indicator when to flush memory resident documents to the Directory. - * Flushes are likely happen concurrently while other threads adding documents - * to the writer. For application stability the available memory in the JVM - * should be significantly larger than the RAM buffer used for indexing. - *

* NOTE: the account of RAM usage for pending deletions is only * approximate. Specifically, if you delete by Query, Lucene currently has no * way to measure the RAM usage of individual Queries so the accounting will diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java index e3d1c1d8659..511635c14ed 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java @@ -24,6 +24,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.bkd.BKDWriter; /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */ class PointValuesWriter { @@ -35,6 +36,7 @@ class PointValuesWriter { private int numDocs; private int lastDocID = -1; private final byte[] packedValue; + private final LiveIndexWriterConfig indexWriterConfig; public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; @@ -43,6 +45,7 @@ class PointValuesWriter { docIDs = new int[16]; iwBytesUsed.addAndGet(16 * Integer.BYTES); packedValue = new byte[fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()]; + indexWriterConfig = docWriter.indexWriterConfig; } // TODO: if exactly the same value is added to exactly the same doc, should we dedup? @@ -124,6 +127,7 @@ class PointValuesWriter { public int getDocCount(String fieldName) { return numDocs; } - }); + }, + Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP)); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 09e6412be5a..97651e47e38 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -204,7 +204,7 @@ public class BKDWriter implements Closeable { // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X // what that level would consume, so we multiply by 0.5 to convert from // bytes to points here. Each dimension has its own sorted partition, so - // we must divide by numDims as wel. + // we must divide by numDims as well. maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims)); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java index afa8ec412f5..4b898c30e21 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java @@ -41,9 +41,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase { if (random().nextBoolean()) { // randomize parameters int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); - double maxMBSortInHeap = 3.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); } // sneaky impersonation! @@ -53,7 +52,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java index cf8372dc427..c139b64d9a5 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java @@ -1156,9 +1156,8 @@ public class TestPointQueries extends LuceneTestCase { private static Codec getCodec() { if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); - double maxMBSortInHeap = 5.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); } return new FilterCodec("Lucene62", Codec.getDefault()) { @@ -1167,7 +1166,7 @@ public class TestPointQueries extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); } @Override diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index d9baf612e2b..bb98145d631 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -87,9 +87,8 @@ public class TestGeo3DPoint extends LuceneTestCase { private static Codec getCodec() { if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); - double maxMBSortInHeap = 3.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); } return new FilterCodec("Lucene62", Codec.getDefault()) { @@ -98,7 +97,7 @@ public class TestGeo3DPoint extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java index 731aaec8926..c3c672b6475 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java @@ -254,11 +254,11 @@ public final class AssertingPointsFormat extends PointsFormat { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { if (fieldInfo.getPointDimensionCount() == 0) { throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0"); } - in.writeField(fieldInfo, values); + in.writeField(fieldInfo, values, maxMBSortInHeap); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java index fd2260be6a5..ffd9a8c4eab 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java @@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { if (random.nextInt(100) == 0) { throw new IOException("Fake IOException"); } - delegate.writeField(fieldInfo, values); + delegate.writeField(fieldInfo, values, maxMBSortInHeap); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java index 275c1864857..926132fa061 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java @@ -67,7 +67,6 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.SloppyMath; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.bkd.BKDWriter; /** * Abstract class to do basic tests for a geospatial impl (high level @@ -1248,7 +1247,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); + return new Lucene60PointsWriter(writeState, pointsInLeaf); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 127549ff065..6e8f7bcb14a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -92,7 +92,6 @@ public class RandomCodec extends AssertingCodec { // which is less effective for testing. // TODO: improve how we randomize this... private final int maxPointsInLeafNode; - private final double maxMBSortInHeap; private final int bkdSplitRandomSeed; @Override @@ -103,9 +102,9 @@ public class RandomCodec extends AssertingCodec { // Randomize how BKDWriter chooses its splis: - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) { + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) { @Override - public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -185,7 +184,6 @@ public class RandomCodec extends AssertingCodec { int lowFreqCutoff = TestUtil.nextInt(random, 2, 100); maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048); - maxMBSortInHeap = 5.0 + (3*random.nextDouble()); bkdSplitRandomSeed = random.nextInt(); add(avoidCodecs, @@ -253,8 +251,7 @@ public class RandomCodec extends AssertingCodec { public String toString() { return super.toString() + ": " + previousMappings.toString() + ", docValues:" + previousDVMappings.toString() + - ", maxPointsInLeafNode=" + maxPointsInLeafNode + - ", maxMBSortInHeap=" + maxMBSortInHeap; + ", maxPointsInLeafNode=" + maxPointsInLeafNode; } /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on