LUCENE-7390: improve points indexing performance by letting the codec use transient heap in proportion to IndexWriter's indexing buffer, by default

Mike McCandless 2016-07-25 11:33:34 -04:00
parent 4ed68bc80e
commit 3a0a9fd2c2
16 changed files with 53 additions and 48 deletions

View File

@@ -135,6 +135,11 @@ Improvements
 * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
+* LUCENE-7390: Improve performance of indexing points by allowing the
+  codec to use transient heap in proportion to IndexWriter's RAM
+  buffer, instead of a fixed 16.0 MB. A custom codec can still
+  override the buffer size itself. (Mike McCandless)
 Optimizations
 * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)

View File

@@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
       fieldInfo.getPointDimensionCount(),
       fieldInfo.getPointNumBytes(),
       BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-      BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+      maxMBSortInHeap,
       values.size(fieldInfo.name),
       singleValuePerDoc) {

View File

@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
+import org.apache.lucene.util.bkd.BKDWriter;
 /** Abstract API to write points
  *
@@ -34,8 +35,9 @@ public abstract class PointsWriter implements Closeable {
   protected PointsWriter() {
   }
-  /** Write all values contained in the provided reader */
-  public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
+  /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
+   *  transient heap that can be used to sort values, before spilling to disk for offline sorting */
+  public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
   /** Default naive merge implementation for one field: it just re-indexes all the values
    *  from the incoming segment. The default codec overrides this for 1D fields and uses
@@ -145,7 +147,10 @@ public abstract class PointsWriter implements Closeable {
       public int getDocCount(String fieldName) {
         return finalDocCount;
       }
-    });
+    },
+    // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
+    // and don't need heap)
+    BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
   }
   /** Default merge implementation to merge incoming points readers by visiting all their points and

View File

@@ -39,7 +39,9 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.bkd.BKDReader;
 import org.apache.lucene.util.bkd.BKDWriter;
-/** Writes dimensional values */
+/** Writes dimensional values
+ *
+ * @lucene.experimental */
 public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   /** Output used to write the BKD tree data file */
@@ -50,15 +52,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   final SegmentWriteState writeState;
   final int maxPointsInLeafNode;
-  final double maxMBSortInHeap;
   private boolean finished;
   /** Full constructor */
-  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
+  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
     assert writeState.fieldInfos.hasPointValues();
     this.writeState = writeState;
     this.maxPointsInLeafNode = maxPointsInLeafNode;
-    this.maxMBSortInHeap = maxMBSortInHeap;
     String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
                                                          writeState.segmentSuffix,
                                                          Lucene60PointsFormat.DATA_EXTENSION);
@@ -80,11 +80,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
   public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
-    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -173,7 +173,8 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
       fieldInfo.getPointDimensionCount(),
       fieldInfo.getPointNumBytes(),
       maxPointsInLeafNode,
-      maxMBSortInHeap,
+      // NOTE: not used, since BKDWriter.merge does a merge sort:
+      BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
       totMaxSize,
       singleValuePerDoc)) {
     List<BKDReader> bkdReaders = new ArrayList<>();
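The CHANGES entry above notes that a custom codec can still control the sort buffer itself. One way to do that under the new writeField signature, sketched here under the assumption of the Lucene 6.x package layout (the class name FixedHeapPointsWriter is hypothetical and not part of this commit), is to subclass Lucene60PointsWriter and ignore the budget IndexWriter passes down:

    import java.io.IOException;

    import org.apache.lucene.codecs.PointsReader;
    import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.index.SegmentWriteState;
    import org.apache.lucene.util.bkd.BKDWriter;

    // Sketch only: keep the old fixed 16.0 MB in-heap sort budget regardless of
    // how large IndexWriter's RAM buffer is. Not part of this commit.
    class FixedHeapPointsWriter extends Lucene60PointsWriter {

      FixedHeapPointsWriter(SegmentWriteState writeState) throws IOException {
        super(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
      }

      @Override
      public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
        // Ignore the budget passed down from IndexWriter and keep the previous default:
        super.writeField(fieldInfo, values, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
      }
    }

Such a writer would still have to be returned from a custom PointsFormat, the same way the test codecs further down in this commit wrap Lucene60PointsWriter.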

View File

@@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
   final Allocator byteBlockAllocator;
   final IntBlockPool.Allocator intBlockAllocator;
   private final AtomicLong pendingNumDocs;
-  private final LiveIndexWriterConfig indexWriterConfig;
+  final LiveIndexWriterConfig indexWriterConfig;
   private final boolean enableTestPoints;
   private final IndexWriter indexWriter;

View File

@@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    * {@link #getConfig()}.
    *
    * <p>
-   * <b>NOTE:</b> after ths writer is created, the given configuration instance
+   * <b>NOTE:</b> after this writer is created, the given configuration instance
    * cannot be passed to another writer.
    *
    * @param d

View File

@@ -168,9 +168,14 @@ public class LiveIndexWriterConfig {
   /**
    * Determines the amount of RAM that may be used for buffering added documents
-   * and deletions before they are flushed to the Directory. Generally for
-   * faster indexing performance it's best to flush by RAM usage instead of
-   * document count and use as large a RAM buffer as you can.
+   * and deletions before beginning to flush them to the Directory. For
+   * faster indexing performance it's best to use as large a RAM buffer as you can.
+   * <p>
+   * Note that this setting is not a hard limit on memory usage during indexing, as
+   * transient and non-trivial memory well beyond this buffer size may be used,
+   * for example due to segment merges or writing points to new segments.
+   * For application stability the available memory in the JVM
+   * should be significantly larger than the RAM buffer used for indexing.
    * <p>
    * When this is set, the writer will flush whenever buffered documents and
    * deletions use this much RAM. Pass in
@@ -178,14 +183,6 @@ public class LiveIndexWriterConfig {
    * due to RAM usage. Note that if flushing by document count is also enabled,
    * then the flush will be triggered by whichever comes first.
    * <p>
-   * The maximum RAM limit is inherently determined by the JVMs available
-   * memory. Yet, an {@link IndexWriter} session can consume a significantly
-   * larger amount of memory than the given RAM limit since this limit is just
-   * an indicator when to flush memory resident documents to the Directory.
-   * Flushes are likely happen concurrently while other threads adding documents
-   * to the writer. For application stability the available memory in the JVM
-   * should be significantly larger than the RAM buffer used for indexing.
-   * <p>
    * <b>NOTE</b>: the account of RAM usage for pending deletions is only
    * approximate. Specifically, if you delete by Query, Lucene currently has no
    * way to measure the RAM usage of individual Queries so the accounting will
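To make the revised javadoc concrete, here is a hedged usage sketch (the index path, analyzer, and the 256 MB figure are illustrative, not taken from the commit): setting a larger RAM buffer now also raises the transient heap the points codec may use while flushing, so the JVM needs headroom beyond the buffer itself.

    import java.io.IOException;
    import java.nio.file.Paths;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class RamBufferExample {
      public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("/tmp/points-index")); // illustrative path
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setRAMBufferSizeMB(256.0); // flush once buffered docs/deletes reach ~256 MB
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
          // After this commit, flushing a points field may additionally use up to
          // max(256 / 8.0, 16.0) = 32 MB of transient heap for in-heap sorting.
        }
      }
    }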

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.bkd.BKDWriter;
 /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
 class PointValuesWriter {
@@ -35,6 +36,7 @@ class PointValuesWriter {
   private int numDocs;
   private int lastDocID = -1;
   private final byte[] packedValue;
+  private final LiveIndexWriterConfig indexWriterConfig;
   public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
@@ -43,6 +45,7 @@ class PointValuesWriter {
     docIDs = new int[16];
     iwBytesUsed.addAndGet(16 * Integer.BYTES);
     packedValue = new byte[fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()];
+    indexWriterConfig = docWriter.indexWriterConfig;
   }
   // TODO: if exactly the same value is added to exactly the same doc, should we dedup?
@@ -124,6 +127,7 @@ class PointValuesWriter {
       public int getDocCount(String fieldName) {
         return numDocs;
       }
-    });
+    },
+    Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
   }
 }
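The new Math.max expression above is the heart of the change. A small sketch of the arithmetic it wires up (the helper name budgetMB is illustrative, not from the commit):

    import org.apache.lucene.util.bkd.BKDWriter;

    public class PointsSortBudget {
      /** Mirrors the expression this commit adds to PointValuesWriter. */
      static double budgetMB(double ramBufferSizeMB) {
        return Math.max(ramBufferSizeMB / 8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
      }

      public static void main(String[] args) {
        System.out.println(budgetMB(64.0));   // 16.0: small buffers keep the old 16 MB floor
        System.out.println(budgetMB(1024.0)); // 128.0: larger buffers scale the budget up
      }
    }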

View File

@@ -204,7 +204,7 @@ public class BKDWriter implements Closeable {
     // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
     // what that level would consume, so we multiply by 0.5 to convert from
     // bytes to points here. Each dimension has its own sorted partition, so
-    // we must divide by numDims as wel.
+    // we must divide by numDims as well.
     maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
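For a worked instance of the conversion on the last line (the inputs below are illustrative; the real bytesPerDoc depends on the field's packed value size and whether ords and doc IDs are stored):

    public class SortHeapMath {
      public static void main(String[] args) {
        double maxMBSortInHeap = 16.0; // the default in-heap sort budget
        int bytesPerDoc = 16;          // illustrative figure, not the exact BKDWriter value
        int numDims = 2;               // e.g. a 2D point field
        int maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
        System.out.println(maxPointsSortInHeap); // 262144 points can be sorted in heap per partition
      }
    }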

View File

@@ -41,9 +41,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     if (random().nextBoolean()) {
       // randomize parameters
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
-      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       // sneaky impersonation!
@@ -53,7 +52,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -1156,9 +1156,8 @@ public class TestPointQueries extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
-      double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -1167,7 +1166,7 @@ public class TestPointQueries extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -87,9 +87,8 @@ public class TestGeo3DPoint extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
-      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
       }
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -98,7 +97,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
         }
         @Override

View File

@@ -254,11 +254,11 @@ public final class AssertingPointsFormat extends PointsFormat {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     if (fieldInfo.getPointDimensionCount() == 0) {
       throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
     }
-    in.writeField(fieldInfo, values);
+    in.writeField(fieldInfo, values, maxMBSortInHeap);
   }
   @Override

View File

@@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
   }
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
     if (random.nextInt(100) == 0) {
       throw new IOException("Fake IOException");
     }
-    delegate.writeField(fieldInfo, values);
+    delegate.writeField(fieldInfo, values, maxMBSortInHeap);
   }
   @Override

View File

@@ -67,7 +67,6 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.SloppyMath;
 import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.bkd.BKDWriter;
 /**
  * Abstract class to do basic tests for a geospatial impl (high level
@@ -1248,7 +1247,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+          return new Lucene60PointsWriter(writeState, pointsInLeaf);
         }
         @Override

View File

@@ -92,7 +92,6 @@ public class RandomCodec extends AssertingCodec {
   // which is less effective for testing.
   // TODO: improve how we randomize this...
   private final int maxPointsInLeafNode;
-  private final double maxMBSortInHeap;
   private final int bkdSplitRandomSeed;
   @Override
@@ -103,9 +102,9 @@ public class RandomCodec extends AssertingCodec {
       // Randomize how BKDWriter chooses its splis:
-      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
+      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
         @Override
-        public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+        public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
           boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -185,7 +184,6 @@ public class RandomCodec extends AssertingCodec {
     int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
     maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
-    maxMBSortInHeap = 5.0 + (3*random.nextDouble());
     bkdSplitRandomSeed = random.nextInt();
     add(avoidCodecs,
@@ -253,8 +251,7 @@ public class RandomCodec extends AssertingCodec {
   public String toString() {
     return super.toString() + ": " + previousMappings.toString() +
            ", docValues:" + previousDVMappings.toString() +
-           ", maxPointsInLeafNode=" + maxPointsInLeafNode +
-           ", maxMBSortInHeap=" + maxMBSortInHeap;
+           ", maxPointsInLeafNode=" + maxPointsInLeafNode;
   }
   /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on