diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 67e268336b7..45bc8309bbe 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,11 +138,6 @@ Improvements * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley) -* LUCENE-7390: Improve performance of indexing points by allowing the - codec to use transient heap in proportion to IndexWriter's RAM - buffer, instead of a fixed 16.0 MB. A custom codec can still - override the buffer size itself. (Mike McCandless) - * LUCENE-7393: Add ICUTokenizer option to parse Myanmar text as syllables instead of words, because the ICU word-breaking algorithm has some issues. This allows for the previous tokenization used before Lucene 5. (AM, Robert Muir) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java index 67289b63c10..8d5c0344509 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java @@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter { fieldInfo.getPointDimensionCount(), fieldInfo.getPointNumBytes(), BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, - maxMBSortInHeap, + BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, values.size(fieldInfo.name), singleValuePerDoc) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java index 845849780a8..05084db6ca1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java @@ -22,7 +22,6 @@ import java.io.IOException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; -import org.apache.lucene.util.bkd.BKDWriter; /** Abstract API to write points * @@ -35,9 +34,8 @@ public abstract class PointsWriter implements Closeable { protected PointsWriter() { } - /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum - * transient heap that can be used to sort values, before spilling to disk for offline sorting */ - public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException; + /** Write all values contained in the provided reader */ + public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException; /** Default naive merge implementation for one field: it just re-indexes all the values * from the incoming segment. The default codec overrides this for 1D fields and uses @@ -147,10 +145,7 @@ public abstract class PointsWriter implements Closeable { public int getDocCount(String fieldName) { return finalDocCount; } - }, - // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort - // and don't need heap) - BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); + }); } /** Default merge implementation to merge incoming points readers by visiting all their points and diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java index 5fedf64fd91..ff9de58091c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java @@ -40,9 +40,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.bkd.BKDReader; import org.apache.lucene.util.bkd.BKDWriter; -/** Writes dimensional values - * - * @lucene.experimental */ +/** Writes dimensional values */ public class Lucene60PointsWriter extends PointsWriter implements Closeable { /** Output used to write the BKD tree data file */ @@ -53,13 +51,15 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { final SegmentWriteState writeState; final int maxPointsInLeafNode; + final double maxMBSortInHeap; private boolean finished; /** Full constructor */ - public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException { + public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException { assert writeState.fieldInfos.hasPointValues(); this.writeState = writeState; this.maxPointsInLeafNode = maxPointsInLeafNode; + this.maxMBSortInHeap = maxMBSortInHeap; String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name, writeState.segmentSuffix, Lucene60PointsFormat.DATA_EXTENSION); @@ -81,11 +81,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */ public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException { - this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); + this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -182,8 +182,7 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { fieldInfo.getPointDimensionCount(), fieldInfo.getPointNumBytes(), maxPointsInLeafNode, - // NOTE: not used, since BKDWriter.merge does a merge sort: - BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, + maxMBSortInHeap, totMaxSize, singleValuePerDoc)) { List bkdReaders = new ArrayList<>(); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 351235e2b5d..e72145c93f6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -153,7 +153,7 @@ class DocumentsWriterPerThread { final Allocator byteBlockAllocator; final IntBlockPool.Allocator intBlockAllocator; private final AtomicLong pendingNumDocs; - final LiveIndexWriterConfig indexWriterConfig; + private final LiveIndexWriterConfig indexWriterConfig; private final boolean enableTestPoints; private final IndexWriter indexWriter; diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index bdd95526429..0fb23d96b36 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * {@link #getConfig()}. * *

- * NOTE: after this writer is created, the given configuration instance + * NOTE: after ths writer is created, the given configuration instance * cannot be passed to another writer. * * @param d diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index fe4924df6c7..cec70c099aa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -168,14 +168,9 @@ public class LiveIndexWriterConfig { /** * Determines the amount of RAM that may be used for buffering added documents - * and deletions before beginning to flush them to the Directory. For - * faster indexing performance it's best to use as large a RAM buffer as you can. - *

- * Note that this setting is not a hard limit on memory usage during indexing, as - * transient and non-trivial memory well beyond this buffer size may be used, - * for example due to segment merges or writing points to new segments. - * For application stability the available memory in the JVM - * should be significantly larger than the RAM buffer used for indexing. + * and deletions before they are flushed to the Directory. Generally for + * faster indexing performance it's best to flush by RAM usage instead of + * document count and use as large a RAM buffer as you can. *

* When this is set, the writer will flush whenever buffered documents and * deletions use this much RAM. Pass in @@ -183,6 +178,14 @@ public class LiveIndexWriterConfig { * due to RAM usage. Note that if flushing by document count is also enabled, * then the flush will be triggered by whichever comes first. *

+ * The maximum RAM limit is inherently determined by the JVMs available + * memory. Yet, an {@link IndexWriter} session can consume a significantly + * larger amount of memory than the given RAM limit since this limit is just + * an indicator when to flush memory resident documents to the Directory. + * Flushes are likely happen concurrently while other threads adding documents + * to the writer. For application stability the available memory in the JVM + * should be significantly larger than the RAM buffer used for indexing. + *

* NOTE: the account of RAM usage for pending deletions is only * approximate. Specifically, if you delete by Query, Lucene currently has no * way to measure the RAM usage of individual Queries so the accounting will diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java index b4decb67174..ce7e5787d53 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java @@ -25,7 +25,6 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; -import org.apache.lucene.util.bkd.BKDWriter; /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */ class PointValuesWriter { @@ -37,7 +36,6 @@ class PointValuesWriter { private int numDocs; private int lastDocID = -1; private final int packedBytesLength; - private final LiveIndexWriterConfig indexWriterConfig; public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; @@ -46,7 +44,6 @@ class PointValuesWriter { docIDs = new int[16]; iwBytesUsed.addAndGet(16 * Integer.BYTES); packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes(); - indexWriterConfig = docWriter.indexWriterConfig; } // TODO: if exactly the same value is added to exactly the same doc, should we dedup? @@ -167,6 +164,6 @@ class PointValuesWriter { } }; - writer.writeField(fieldInfo, reader, Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP)); + writer.writeField(fieldInfo, reader, Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0)); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index d0d7dca2eb8..8bd66d0a347 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -205,7 +205,7 @@ public class BKDWriter implements Closeable { // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X // what that level would consume, so we multiply by 0.5 to convert from // bytes to points here. Each dimension has its own sorted partition, so - // we must divide by numDims as well. + // we must divide by numDims as wel. maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims)); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java index 4b898c30e21..afa8ec412f5 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java @@ -41,8 +41,9 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase { if (random().nextBoolean()) { // randomize parameters int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); + double maxMBSortInHeap = 3.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); } // sneaky impersonation! @@ -52,7 +53,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java index c139b64d9a5..cf8372dc427 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java @@ -1156,8 +1156,9 @@ public class TestPointQueries extends LuceneTestCase { private static Codec getCodec() { if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); + double maxMBSortInHeap = 5.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); } return new FilterCodec("Lucene62", Codec.getDefault()) { @@ -1166,7 +1167,7 @@ public class TestPointQueries extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); } @Override diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index bb98145d631..d9baf612e2b 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -87,8 +87,9 @@ public class TestGeo3DPoint extends LuceneTestCase { private static Codec getCodec() { if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); + double maxMBSortInHeap = 3.0 + (3*random().nextDouble()); if (VERBOSE) { - System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode); + System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); } return new FilterCodec("Lucene62", Codec.getDefault()) { @@ -97,7 +98,7 @@ public class TestGeo3DPoint extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode); + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java index 0bbf2c6faf6..c6f5485202a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPointsFormat.java @@ -255,11 +255,11 @@ public final class AssertingPointsFormat extends PointsFormat { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { if (fieldInfo.getPointDimensionCount() == 0) { throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0"); } - in.writeField(fieldInfo, values, maxMBSortInHeap); + in.writeField(fieldInfo, values); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java index ffd9a8c4eab..fd2260be6a5 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPointsFormat.java @@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat { } @Override - public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { if (random.nextInt(100) == 0) { throw new IOException("Fake IOException"); } - delegate.writeField(fieldInfo, values, maxMBSortInHeap); + delegate.writeField(fieldInfo, values); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java index 926132fa061..275c1864857 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java @@ -67,6 +67,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.SloppyMath; import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.bkd.BKDWriter; /** * Abstract class to do basic tests for a geospatial impl (high level @@ -1247,7 +1248,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene60PointsWriter(writeState, pointsInLeaf); + return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 6e8f7bcb14a..127549ff065 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -92,6 +92,7 @@ public class RandomCodec extends AssertingCodec { // which is less effective for testing. // TODO: improve how we randomize this... private final int maxPointsInLeafNode; + private final double maxMBSortInHeap; private final int bkdSplitRandomSeed; @Override @@ -102,9 +103,9 @@ public class RandomCodec extends AssertingCodec { // Randomize how BKDWriter chooses its splis: - return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) { + return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) { @Override - public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException { + public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException { boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name); @@ -184,6 +185,7 @@ public class RandomCodec extends AssertingCodec { int lowFreqCutoff = TestUtil.nextInt(random, 2, 100); maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048); + maxMBSortInHeap = 5.0 + (3*random.nextDouble()); bkdSplitRandomSeed = random.nextInt(); add(avoidCodecs, @@ -251,7 +253,8 @@ public class RandomCodec extends AssertingCodec { public String toString() { return super.toString() + ": " + previousMappings.toString() + ", docValues:" + previousDVMappings.toString() + - ", maxPointsInLeafNode=" + maxPointsInLeafNode; + ", maxPointsInLeafNode=" + maxPointsInLeafNode + + ", maxMBSortInHeap=" + maxMBSortInHeap; } /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on