LUCENE-3216: keep float doc values in memory during indexing, while merges write directly to the target file

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140044 13f79535-47bb-0310-9956-ffa450edef68
2011-06-27 08:07:15 +00:00 · 2011-06-27 08:07:15 +00:00 · 28c15b9637
parent b15ca9a6b9
commit 28c15b9637
4 changed files with 101 additions and 50 deletions
--- a/lucene/src/java/org/apache/lucene/index/values/Bytes.java
+++ b/lucene/src/java/org/apache/lucene/index/values/Bytes.java
@ -387,7 +387,7 @@ public final class Bytes {
    }

    @Override
-    protected void add(int docID) throws IOException {
+    protected void mergeDoc(int docID) throws IOException {
      add(docID, bytesRef);
    }

--- a/lucene/src/java/org/apache/lucene/index/values/Floats.java
+++ b/lucene/src/java/org/apache/lucene/index/values/Floats.java
@ -25,10 +25,12 @@ import org.apache.lucene.index.values.IndexDocValues.Source;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CodecUtil;
 import org.apache.lucene.util.FloatsRef;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;

 /**
 * Exposes {@link Writer} and reader ({@link Source}) for 32 bit and 64 bit
@ -44,11 +46,7 @@ public class Floats {
  private static final String CODEC_NAME = "SimpleFloats";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;
-  private static final int INT_DEFAULT = Float
-      .floatToRawIntBits(0.0f);
-  private static final long LONG_DEFAULT = Double
-      .doubleToRawLongBits(0.0d);
-
+  private static final byte[] DEFAULTS = new byte[] {0,0,0,0,0,0,0,0};
  
  public static Writer getWriter(Directory dir, String id, int precisionBytes,
      AtomicLong bytesUsed) throws IOException {
@ -70,16 +68,27 @@ public class Floats {

  abstract static class FloatsWriter extends Writer {
    private final String id;
-    private FloatsRef floatsRef;
+    protected FloatsRef floatsRef;
    protected int lastDocId = -1;
    protected IndexOutput datOut;
    private final byte precision;
+    private final Directory dir;

    protected FloatsWriter(Directory dir, String id, int precision,
        AtomicLong bytesUsed) throws IOException {
      super(bytesUsed);
      this.id = id;
      this.precision = (byte) precision;
+      this.dir = dir;
+     
+    }
+
+    public long ramBytesUsed() {
+      return 0;
+    }
+    
+    final void initDataOut() throws IOException {
+      assert datOut == null;
      datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "",
          Writer.DATA_EXTENSION));
      boolean success = false;
@ -95,12 +104,8 @@ public class Floats {
      }
    }

-    public long ramBytesUsed() {
-      return 0;
-    }
-
    @Override
-    protected void add(int docID) throws IOException {
+    protected void mergeDoc(int docID) throws IOException {
      add(docID, floatsRef.get());
    }

@ -114,13 +119,20 @@ public class Floats {
      floatsRef = valuesEnum.getFloat();
    }

-    protected abstract int fillDefault(int num) throws IOException;
+    protected final int fillDefault(int numValues) throws IOException {
+      for (int i = 0; i < numValues; i++) {
+        datOut.writeBytes(DEFAULTS, precision);
+      }
+      return numValues;
+    }

    @Override
    protected void merge(MergeState state) throws IOException {
+      if (datOut == null) {
+        initDataOut();
+      }
      if (state.bits == null && state.reader instanceof FloatsReader) {
        // no deletes - bulk copy
-        // TODO: should be do bulks with deletes too?
        final FloatsReader reader = (FloatsReader) state.reader;
        assert reader.precisionBytes == (int) precision;
        if (reader.maxDoc == 0)
@ -131,10 +143,12 @@ public class Floats {
          lastDocId += fillDefault(docBase - lastDocId - 1);
        }
        lastDocId += reader.transferTo(datOut);
-      } else
+      } else {
        super.merge(state);        
      }

+    }
+
    @Override
    public void files(Collection<String> files) throws IOException {
      files.add(IndexFileNames.segmentFileName(id, "", Writer.DATA_EXTENSION));
@ -143,10 +157,12 @@ public class Floats {

  // Writes 4 bytes (float) per value
  static class Float4Writer extends FloatsWriter {
-
+    private int[] values;
    protected Float4Writer(Directory dir, String id, AtomicLong bytesUsed)
        throws IOException {
      super(dir, id, 4, bytesUsed);
+      values = new int[1];
+      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT);
    }

    @Override
@ -154,46 +170,80 @@ public class Floats {
        throws IOException {
      assert docID > lastDocId : "docID: " + docID
          + " must be greater than the last added doc id: " + lastDocId;
+      if (docID >= values.length) {
+        final long len = values.length;
+        values = ArrayUtil.grow(values, 1 + docID);
+        bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
+            * ((values.length) - len));
+      }
+      values[docID] = Float.floatToRawIntBits((float)v);
+      lastDocId = docID;
+    }
+
+    @Override
+    protected void mergeDoc(int docID) throws IOException {
+      assert datOut != null;
+      assert docID > lastDocId : "docID: " + docID
+      + " must be greater than the last added doc id: " + lastDocId;
      if (docID - lastDocId > 1) {
        // fill with default values
-        lastDocId += fillDefault(docID - lastDocId - 1);
+        fillDefault(docID - lastDocId - 1);
      }
      assert datOut != null;
-      datOut.writeInt(Float.floatToRawIntBits((float) v));
-      ++lastDocId;
+      datOut.writeInt(Float.floatToRawIntBits((float) floatsRef.get()));
+      lastDocId = docID;
    }

    @Override
    public void finish(int docCount) throws IOException {
+      boolean success = false;
      try {
-        if (docCount > lastDocId + 1)
-          for (int i = lastDocId; i < docCount; i++) {
-            datOut.writeInt(INT_DEFAULT); // default value
+        int numDefaultsToAppend = docCount - (lastDocId + 1);
+        if (datOut == null) {
+          initDataOut();
+          for (int i = 0; i <= lastDocId; i++) {
+            datOut.writeInt(values[i]);
          }
+        }
+        fillDefault(numDefaultsToAppend);
+        success = true;
      } finally {
-        datOut.close();
+        bytesUsed.addAndGet(-(RamUsageEstimator.NUM_BYTES_INT
+            * ((values.length))));
+        values = null;
+        IOUtils.closeSafely(!success, datOut);
      }
    }

-    @Override
-    protected int fillDefault(int numValues) throws IOException {
-      for (int i = 0; i < numValues; i++) {
-        datOut.writeInt(INT_DEFAULT);
-      }
-      return numValues;
-    }
+    
  }

  // Writes 8 bytes (double) per value
  static class Float8Writer extends FloatsWriter {
-
+    private long[] values;
    protected Float8Writer(Directory dir, String id, AtomicLong bytesUsed)
        throws IOException {
      super(dir, id, 8, bytesUsed);
+      values = new long[1];
+      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG);
    }

    @Override
    public void add(int docID, double v) throws IOException {
+      assert docID > lastDocId : "docID: " + docID
+          + " must be greater than the last added doc id: " + lastDocId;
+      if (docID >= values.length) { // grow the in-memory buffer so that index docID is addressable
+        final long len = values.length;
+        values = ArrayUtil.grow(values, 1 + docID);
+        bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG
+            * ((values.length) - len)); // account only for the newly added slots
+      }
+      values[docID] = Double.doubleToRawLongBits(v); // raw bits, consistent with mergeDoc and Float4Writer (no NaN canonicalization)
+      lastDocId = docID;
+    }
+    
+    @Override
+    protected void mergeDoc(int docID) throws IOException {
      assert docID > lastDocId : "docID: " + docID
      + " must be greater than the last added doc id: " + lastDocId;
      if (docID - lastDocId > 1) {
@ -201,29 +251,30 @@ public class Floats {
        lastDocId += fillDefault(docID - lastDocId - 1);
      }
      assert datOut != null;
-      datOut.writeLong(Double.doubleToRawLongBits(v));
-      ++lastDocId;
+      datOut.writeLong(Double.doubleToRawLongBits(floatsRef.get())); // 8-byte writer: do not narrow to float, keep full double precision
+      lastDocId = docID;
    }

    @Override
    public void finish(int docCount) throws IOException {
+      boolean success = false;
      try {
-        if (docCount > lastDocId + 1)
-          for (int i = lastDocId; i < docCount; i++) {
-            datOut.writeLong(LONG_DEFAULT); // default value
+        int numDefaultsToAppend = docCount - (lastDocId + 1);
+        if (datOut == null) {
+          initDataOut();
+          for (int i = 0; i <= lastDocId; i++) {
+            datOut.writeLong(values[i]);
          }
+        }
+        fillDefault(numDefaultsToAppend);
+        success = true;
      } finally {
-        datOut.close();
+        bytesUsed.addAndGet(-(RamUsageEstimator.NUM_BYTES_LONG
+            * ((values.length))));
+        values = null;
+        IOUtils.closeSafely(!success, datOut);
      }
    }
-
-    @Override
-    protected int fillDefault(int numValues) throws IOException {
-      for (int i = 0; i < numValues; i++) {
-        datOut.writeLong(LONG_DEFAULT);
-      }
-      return numValues;
-    }
  }

  /**
--- a/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java
+++ b/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java
@ -157,7 +157,7 @@ class IntsImpl {
    }

    @Override
-    protected void add(int docID) throws IOException {
+    protected void mergeDoc(int docID) throws IOException {
      add(docID, intsRef.get());
    }

--- a/lucene/src/java/org/apache/lucene/index/values/Writer.java
+++ b/lucene/src/java/org/apache/lucene/index/values/Writer.java
@ -113,11 +113,11 @@ public abstract class Writer extends DocValuesConsumer {
   * the {@link Writer} implementation. The given document ID must always be
   * greater than the previous ID or <tt>0</tt> if called the first time.
   */
-  protected abstract void add(int docID) throws IOException;
+  protected abstract void mergeDoc(int docID) throws IOException;

  /**
   * Sets the next {@link ValuesEnum} to consume values from on calls to
-   * {@link #add(int)}
+   * {@link #mergeDoc(int)}
   * 
   * @param valuesEnum
   *          the next {@link ValuesEnum}, this must not be null
@ -159,7 +159,7 @@ public abstract class Writer extends DocValuesConsumer {
              }
            }
            if (currentDocId == i) { // we are on the doc to merge
-              add(docID);
+              mergeDoc(docID);
            }
            ++docID;
          }