mirror of https://github.com/apache/lucene.git
LUCENE-7390: revert this change, since it's obsoleted by the much better LUCENE-7396
parent 60975d2dfa
commit 1aecdd28d1
@@ -138,11 +138,6 @@ Improvements
 
 * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
 
-* LUCENE-7390: Improve performance of indexing points by allowing the
-  codec to use transient heap in proportion to IndexWriter's RAM
-  buffer, instead of a fixed 16.0 MB. A custom codec can still
-  override the buffer size itself. (Mike McCandless)
-
 * LUCENE-7393: Add ICUTokenizer option to parse Myanmar text as syllables instead of words,
   because the ICU word-breaking algorithm has some issues. This allows for the previous
   tokenization used before Lucene 5. (AM, Robert Muir)
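
For context, the sizing rule this commit reverts (visible in the PointValuesWriter hunk below) derived each field's point-sort heap from IndexWriter's RAM buffer rather than the fixed 16.0 MB default. A minimal sketch of that rule as a free-standing helper (the method name is hypothetical, not part of the commit):

    // Reverted heuristic: a flushing points field may sort in up to 1/8th
    // of IndexWriter's RAM buffer, floored at the old fixed default of
    // 16.0 MB (BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP).
    static double pointsSortHeapMB(double ramBufferSizeMB) {
      final double DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0;
      return Math.max(ramBufferSizeMB / 8.0, DEFAULT_MAX_MB_SORT_IN_HEAP);
    }
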
@@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
   }
 
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
 
@@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
                                                  fieldInfo.getPointDimensionCount(),
                                                  fieldInfo.getPointNumBytes(),
                                                  BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-                                                 maxMBSortInHeap,
+                                                 BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
                                                  values.size(fieldInfo.name),
                                                  singleValuePerDoc) {
 
@@ -22,7 +22,6 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
-import org.apache.lucene.util.bkd.BKDWriter;
 
 /** Abstract API to write points
  *
@@ -35,9 +34,8 @@ public abstract class PointsWriter implements Closeable {
   protected PointsWriter() {
   }
 
-  /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
-   *  transient heap that can be used to sort values, before spilling to disk for offline sorting */
-  public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
+  /** Write all values contained in the provided reader */
+  public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
 
   /** Default naive merge implementation for one field: it just re-indexes all the values
    *  from the incoming segment. The default codec overrides this for 1D fields and uses
@@ -147,10 +145,7 @@ public abstract class PointsWriter implements Closeable {
                        public int getDocCount(String fieldName) {
                          return finalDocCount;
                        }
-                     },
-                     // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
-                     // and don't need heap)
-                     BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+                     });
   }
 
   /** Default merge implementation to merge incoming points readers by visiting all their points and
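
After this hunk, heap budgeting is no longer threaded through writeField; it is each codec's own concern. A minimal delegating writer against the restored two-argument API, in the same spirit as the AssertingPointsFormat hunk further down (this class is a hypothetical sketch, not part of the commit):

    import java.io.IOException;
    import org.apache.lucene.codecs.PointsReader;
    import org.apache.lucene.codecs.PointsWriter;
    import org.apache.lucene.index.FieldInfo;

    // Forwards every call; note writeField no longer takes maxMBSortInHeap.
    class DelegatingPointsWriter extends PointsWriter {
      private final PointsWriter in;

      DelegatingPointsWriter(PointsWriter in) {
        this.in = in;
      }

      @Override
      public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
        in.writeField(fieldInfo, values);
      }

      @Override
      public void finish() throws IOException {
        in.finish();
      }

      @Override
      public void close() throws IOException {
        in.close();
      }
    }
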
@@ -40,9 +40,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.bkd.BKDReader;
 import org.apache.lucene.util.bkd.BKDWriter;
 
-/** Writes dimensional values
- *
- * @lucene.experimental */
+/** Writes dimensional values */
 public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   /** Output used to write the BKD tree data file */
@@ -53,13 +51,15 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   final SegmentWriteState writeState;
   final int maxPointsInLeafNode;
+  final double maxMBSortInHeap;
   private boolean finished;
 
   /** Full constructor */
-  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
+  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
     assert writeState.fieldInfos.hasPointValues();
     this.writeState = writeState;
     this.maxPointsInLeafNode = maxPointsInLeafNode;
+    this.maxMBSortInHeap = maxMBSortInHeap;
     String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
                                                          writeState.segmentSuffix,
                                                          Lucene60PointsFormat.DATA_EXTENSION);
@@ -81,11 +81,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
   public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
-    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
   }
 
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
 
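
With maxMBSortInHeap back on the constructor, a codec that wants a different sort budget fixes it at writer-construction time instead of per field. A sketch of such a PointsFormat (the 64.0 budget is illustrative only; the pattern mirrors the test hunks below):

    PointsFormat pf = new PointsFormat() {
      @Override
      public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
        // Custom transient-heap budget, default leaf size:
        return new Lucene60PointsWriter(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, 64.0);
      }

      @Override
      public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
        return new Lucene60PointsReader(readState);
      }
    };
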
@@ -182,8 +182,7 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
                                            fieldInfo.getPointDimensionCount(),
                                            fieldInfo.getPointNumBytes(),
                                            maxPointsInLeafNode,
-                                           // NOTE: not used, since BKDWriter.merge does a merge sort:
-                                           BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+                                           maxMBSortInHeap,
                                            totMaxSize,
                                            singleValuePerDoc)) {
         List<BKDReader> bkdReaders = new ArrayList<>();
@@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
   final Allocator byteBlockAllocator;
   final IntBlockPool.Allocator intBlockAllocator;
   private final AtomicLong pendingNumDocs;
-  final LiveIndexWriterConfig indexWriterConfig;
+  private final LiveIndexWriterConfig indexWriterConfig;
   private final boolean enableTestPoints;
   private final IndexWriter indexWriter;
 
@@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    * {@link #getConfig()}.
    *
    * <p>
-   * <b>NOTE:</b> after this writer is created, the given configuration instance
+   * <b>NOTE:</b> after ths writer is created, the given configuration instance
    * cannot be passed to another writer.
    *
    * @param d
@@ -168,14 +168,9 @@ public class LiveIndexWriterConfig {
 
   /**
    * Determines the amount of RAM that may be used for buffering added documents
-   * and deletions before beginning to flush them to the Directory. For
-   * faster indexing performance it's best to use as large a RAM buffer as you can.
-   * <p>
-   * Note that this setting is not a hard limit on memory usage during indexing, as
-   * transient and non-trivial memory well beyond this buffer size may be used,
-   * for example due to segment merges or writing points to new segments.
-   * For application stability the available memory in the JVM
-   * should be significantly larger than the RAM buffer used for indexing.
+   * and deletions before they are flushed to the Directory. Generally for
+   * faster indexing performance it's best to flush by RAM usage instead of
+   * document count and use as large a RAM buffer as you can.
    * <p>
    * When this is set, the writer will flush whenever buffered documents and
    * deletions use this much RAM. Pass in
@@ -183,6 +178,14 @@ public class LiveIndexWriterConfig {
    * due to RAM usage. Note that if flushing by document count is also enabled,
    * then the flush will be triggered by whichever comes first.
    * <p>
+   * The maximum RAM limit is inherently determined by the JVMs available
+   * memory. Yet, an {@link IndexWriter} session can consume a significantly
+   * larger amount of memory than the given RAM limit since this limit is just
+   * an indicator when to flush memory resident documents to the Directory.
+   * Flushes are likely happen concurrently while other threads adding documents
+   * to the writer. For application stability the available memory in the JVM
+   * should be significantly larger than the RAM buffer used for indexing.
+   * <p>
    * <b>NOTE</b>: the account of RAM usage for pending deletions is only
    * approximate. Specifically, if you delete by Query, Lucene currently has no
    * way to measure the RAM usage of individual Queries so the accounting will
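
The restored javadoc recommends flushing by RAM usage with as large a buffer as the JVM can safely spare. A small usage sketch (the path and sizes are illustrative, not tuned advice):

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.FSDirectory;

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setRAMBufferSizeMB(256.0);  // flush trigger, not a hard memory cap
    iwc.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);  // flush by RAM only
    try (IndexWriter w = new IndexWriter(FSDirectory.open(Paths.get("/tmp/points-index")), iwc)) {
      // add documents; a flush starts once buffered docs + deletes reach ~256 MB
    }
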
@@ -25,7 +25,6 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.bkd.BKDWriter;
 
 /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
 class PointValuesWriter {
@@ -37,7 +36,6 @@ class PointValuesWriter {
   private int numDocs;
   private int lastDocID = -1;
   private final int packedBytesLength;
-  private final LiveIndexWriterConfig indexWriterConfig;
 
   public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
@@ -46,7 +44,6 @@ class PointValuesWriter {
     docIDs = new int[16];
     iwBytesUsed.addAndGet(16 * Integer.BYTES);
     packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes();
-    indexWriterConfig = docWriter.indexWriterConfig;
   }
 
   // TODO: if exactly the same value is added to exactly the same doc, should we dedup?
@@ -167,6 +164,6 @@ class PointValuesWriter {
       }
     };
 
-    writer.writeField(fieldInfo, reader, Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
+    writer.writeField(fieldInfo, reader);
   }
 }
@@ -205,7 +205,7 @@ public class BKDWriter implements Closeable {
     // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
     // what that level would consume, so we multiply by 0.5 to convert from
     // bytes to points here. Each dimension has its own sorted partition, so
-    // we must divide by numDims as well.
+    // we must divide by numDims as wel.
 
     maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
 
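
To make the accounting in that comment concrete, a worked instance of the assignment with assumed inputs (bytesPerDoc = 20 is hypothetical; the real value depends on the field's packed byte length plus bookkeeping):

    double maxMBSortInHeap = 16.0;  // BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP
    int numDims = 2;
    int bytesPerDoc = 20;           // assumed per-point footprint
    int maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
    // 0.5 * 16 MB = 8,388,608 bytes; divided by 40 bytes/point = 209,715 points
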
@@ -41,8 +41,9 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     if (random().nextBoolean()) {
       // randomize parameters
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       // sneaky impersonation!
@@ -52,7 +53,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
         }
 
         @Override
@@ -1156,8 +1156,9 @@ public class TestPointQueries extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
+      double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -1166,7 +1167,7 @@ public class TestPointQueries extends LuceneTestCase {
         return new PointsFormat() {
           @Override
           public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
           }
 
           @Override
@@ -87,8 +87,9 @@ public class TestGeo3DPoint extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
+      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -97,7 +98,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
         return new PointsFormat() {
           @Override
           public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
           }
 
           @Override
@@ -255,11 +255,11 @@ public final class AssertingPointsFormat extends PointsFormat {
     }
 
     @Override
-    public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+    public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
       if (fieldInfo.getPointDimensionCount() == 0) {
         throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
       }
-      in.writeField(fieldInfo, values, maxMBSortInHeap);
+      in.writeField(fieldInfo, values);
     }
 
     @Override
@@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
     }
 
     @Override
-    public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+    public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
       if (random.nextInt(100) == 0) {
         throw new IOException("Fake IOException");
       }
-      delegate.writeField(fieldInfo, values, maxMBSortInHeap);
+      delegate.writeField(fieldInfo, values);
     }
 
     @Override
@@ -67,6 +67,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.SloppyMath;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
 
 /**
  * Abstract class to do basic tests for a geospatial impl (high level
@@ -1247,7 +1248,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, pointsInLeaf);
+          return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
         }
 
         @Override
@@ -92,6 +92,7 @@ public class RandomCodec extends AssertingCodec {
   // which is less effective for testing.
   // TODO: improve how we randomize this...
   private final int maxPointsInLeafNode;
+  private final double maxMBSortInHeap;
   private final int bkdSplitRandomSeed;
 
   @Override
@@ -102,9 +103,9 @@ public class RandomCodec extends AssertingCodec {
 
       // Randomize how BKDWriter chooses its splis:
 
-      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
+      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
         @Override
-        public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+        public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
           boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
 
@@ -184,6 +185,7 @@ public class RandomCodec extends AssertingCodec {
     int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
 
     maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
+    maxMBSortInHeap = 5.0 + (3*random.nextDouble());
     bkdSplitRandomSeed = random.nextInt();
 
     add(avoidCodecs,
@@ -251,7 +253,8 @@ public class RandomCodec extends AssertingCodec {
   public String toString() {
     return super.toString() + ": " + previousMappings.toString() +
            ", docValues:" + previousDVMappings.toString() +
-           ", maxPointsInLeafNode=" + maxPointsInLeafNode;
+           ", maxPointsInLeafNode=" + maxPointsInLeafNode +
+           ", maxMBSortInHeap=" + maxMBSortInHeap;
   }
 
   /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on