LUCENE-1717: properly account for RAM used by buffered deletes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@792532 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-07-09 12:44:57 +00:00
parent 343f292dad
commit 1f1fa05a7e
4 changed files with 103 additions and 27 deletions

CHANGES.txt

@@ -127,6 +127,9 @@ Changes in runtime behavior
is failing to close reader/writers. (Brian Groose via Mike
McCandless)
+9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+buffered deletions. (Mike McCandless)
API Changes
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is

src/java/org/apache/lucene/index/BufferedDeletes.java

@@ -35,6 +35,7 @@ class BufferedDeletes {
HashMap terms = new HashMap();
HashMap queries = new HashMap();
List docIDs = new ArrayList();
+long bytesUsed;
// Number of documents a delete term applies to.
final static class Num {
@@ -60,17 +61,21 @@ class BufferedDeletes {
}
}
int size() {
+// We use numTerms not terms.size() intentionally, so
+// that deletes by the same term multiple times "count",
+// ie if you ask to flush every 1000 deletes then even
+// dup'd terms are counted towards that 1000
return numTerms + queries.size() + docIDs.size();
}
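An aside on that counting rule: the term map keeps one entry per unique term, while numTerms is bumped on every buffered delete, so the two can diverge. A standalone sketch of the distinction (illustrative only, not Lucene code):

    import java.util.HashMap;

    // Why size() uses numTerms rather than terms.size(): deleting by the
    // same term three times leaves one map entry but counts three deletes.
    public class DupTermCounting {
        public static void main(String[] args) {
            HashMap terms = new HashMap();  // raw types, matching the era
            int numTerms = 0;
            for (int i = 0; i < 3; i++) {
                terms.put("id:42", new Integer(i)); // same key: entry is overwritten
                numTerms++;                         // but every delete still counts
            }
            // Prints "terms.size()=1 numTerms=3": flush-by-count sees 3, not 1.
            System.out.println("terms.size()=" + terms.size() + " numTerms=" + numTerms);
        }
    }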
void update(BufferedDeletes in) {
numTerms += in.numTerms;
+bytesUsed += in.bytesUsed;
terms.putAll(in.terms);
queries.putAll(in.queries);
docIDs.addAll(in.docIDs);
-in.terms.clear();
-in.numTerms = 0;
-in.queries.clear();
-in.docIDs.clear();
+in.clear();
}
void clear() {
@@ -78,6 +83,11 @@ class BufferedDeletes {
queries.clear();
docIDs.clear();
numTerms = 0;
+bytesUsed = 0;
}
+void addBytesUsed(long b) {
+bytesUsed += b;
+}
boolean any() {

src/java/org/apache/lucene/index/DocumentsWriter.java

@@ -38,6 +38,7 @@ import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
/**
* This class accepts multiple added documents and directly
@@ -887,8 +888,25 @@ final class DocumentsWriter {
}
synchronized boolean deletesFull() {
-return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
-&& ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size()) >= maxBufferedDeleteTerms);
+return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+(deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+(maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
}
+synchronized boolean doApplyDeletes() {
+// Very similar to deletesFull(), except we don't count
+// numBytesAlloc, because we are checking whether
+// deletes (alone) are consuming too many resources now
+// and thus should be applied. We apply deletes if RAM
+// usage is > 1/2 of our allowed RAM buffer, to prevent
+// too-frequent flushing of a long tail of tiny segments
+// when merges (which always apply deletes) are
+// infrequent.
+return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+(deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+(maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+}
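The two thresholds above are deliberately different: deletesFull() fires when documents plus deletes fill the whole buffer, while doApplyDeletes() fires once deletes alone occupy half of it. A rough worked example (illustrative values, assuming a 16 MB buffer with both auto-flush triggers enabled):

    // Illustrative arithmetic only, not Lucene code.
    public class DeleteTriggers {
        public static void main(String[] args) {
            long ramBufferSize = 16L * 1024 * 1024; // 16 MB buffer
            long deletesBytes = 9L * 1024 * 1024;   // RAM held by buffered deletes
            long docBytes = 2L * 1024 * 1024;       // RAM held by buffered documents

            // deletesFull(): docs + deletes must reach the full buffer.
            boolean deletesFull = (deletesBytes + docBytes) >= ramBufferSize; // 11 MB < 16 MB: false
            // doApplyDeletes(): deletes alone must reach half the buffer.
            boolean applyDeletes = deletesBytes >= ramBufferSize / 2;         // 9 MB >= 8 MB: true

            System.out.println("deletesFull=" + deletesFull + " applyDeletes=" + applyDeletes);
        }
    }

So in this state the writer does not yet flush for RAM, but the next flush it performs will apply the buffered deletes, keeping a long tail of deletions from monopolizing the buffer between infrequent merges.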
synchronized private boolean timeToFlushDeletes() {
@@ -1015,20 +1033,24 @@ final class DocumentsWriter {
else
num.setNum(docIDUpto);
deletesInRAM.numTerms++;
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
}
// Buffer a specific docID for deletion. Currently only
// used when we hit an exception when adding a document
synchronized private void addDeleteDocID(int docID) {
deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
}
synchronized private void addDeleteQuery(Query query, int docID) {
deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
}
synchronized boolean doBalanceRAM() {
-return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
}
/** Does the synchronized work to finish/flush the
@@ -1044,7 +1066,6 @@
assert docWriter == null || docWriter.docID == perThread.docState.docID;
if (aborting) {
// We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
final SkipDocWriter skipDocWriter = new SkipDocWriter();
long getRAMUsed() {
-return numBytesUsed;
+return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
}
long numBytesAlloc;
@@ -1137,10 +1158,34 @@
+// Coarse estimates used to measure RAM usage of buffered deletes
final static int OBJECT_HEADER_BYTES = 8;
-final static int POINTER_NUM_BYTE = 4;
+final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
final static int INT_NUM_BYTE = 4;
final static int CHAR_NUM_BYTE = 2;
+/* Rough logic: HashMap has an array[Entry] w/ varying
+load factor (say 2 * POINTER). Entry is object w/ Term
+key, BufferedDeletes.Num val, int hash, Entry next
+(OBJ_HEADER + 3*POINTER + INT). Term is object w/
+String field and String text (OBJ_HEADER + 2*POINTER).
+We don't count Term's field since it's interned.
+Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+OBJ_HEADER + string.length*CHAR). BufferedDeletes.Num is
+OBJ_HEADER + INT. */
+final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+/* Rough logic: del docIDs are List<Integer>. Say list
+allocates ~2X size (2*POINTER). Integer is OBJ_HEADER + int */
+final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+/* Rough logic: HashMap has an array[Entry] w/ varying
+load factor (say 2 * POINTER). Entry is object w/
+Query key, Integer val, int hash, Entry next
+(OBJ_HEADER + 3*POINTER + INT). Query we often
+undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
+final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24;
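Plugging the constants in makes these estimates concrete. On a 32-bit JRE (POINTER_NUM_BYTE = 4):

    BYTES_PER_DEL_TERM  = 8*4 + 5*8 + 6*4       = 96 bytes, plus 2 bytes per character of term text
    BYTES_PER_DEL_DOCID = 2*4 + 8 + 4           = 20 bytes
    BYTES_PER_DEL_QUERY = 5*4 + 2*8 + 2*4 + 24  = 68 bytes

On a 64-bit JRE (POINTER_NUM_BYTE = 8) the same formulas give 128, 28 and 88 bytes. The estimates are deliberately coarse; the true footprint varies by JVM, but they only drive flush triggers, so consistency matters more than exactness.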
/* Initial chunks size of the shared byte[] blocks used to
store postings data */
final static int BYTE_BLOCK_SHIFT = 15;
@@ -1285,17 +1330,20 @@
// We flush when we've used our target usage
final long flushTrigger = ramBufferSize;
-if (numBytesAlloc > freeTrigger) {
+final long deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
+if (numBytesAlloc+deletesRAMUsed > freeTrigger) {
if (infoStream != null)
message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
" vs trigger=" + toMB(flushTrigger) +
" allocMB=" + toMB(numBytesAlloc) +
" deletesMB=" + toMB(deletesRAMUsed) +
" vs trigger=" + toMB(freeTrigger) +
" byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
" charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
-final long startBytesAlloc = numBytesAlloc;
+final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
int iter = 0;
@@ -1305,12 +1353,12 @@
boolean any = true;
-while(numBytesAlloc > freeLevel) {
+while(numBytesAlloc+deletesRAMUsed > freeLevel) {
synchronized(this) {
if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) {
// Nothing else to free -- must flush now.
-bufferIsFull = numBytesUsed > flushTrigger;
+bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
if (infoStream != null) {
if (numBytesUsed > flushTrigger)
message(" nothing to free; now set bufferIsFull");
@@ -1345,7 +1393,7 @@
}
if (infoStream != null)
message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.) + " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
} else {
// If we have not crossed the 100% mark, but have
@@ -1355,10 +1403,11 @@
// flush.
synchronized(this) {
-if (numBytesUsed > flushTrigger) {
+if (numBytesUsed+deletesRAMUsed > flushTrigger) {
if (infoStream != null)
message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) +
" allocMB=" + nf.format(numBytesAlloc/1024./1024.) +
" deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) +
" triggerMB=" + nf.format(flushTrigger/1024./1024.));
bufferIsFull = true;
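The balanceRAM() changes in the hunks above all follow one rule: buffered-delete RAM counts toward the triggers but cannot be reclaimed by freeing blocks, only by a flush that applies the deletes. A sketch of the resulting arithmetic (the ~105%/95% trigger levels are assumptions about the surrounding, unshown code, not values visible in this diff):

    // Illustrative only, not Lucene code; trigger levels are assumed.
    public class BalanceRAMSketch {
        public static void main(String[] args) {
            long ramBufferSize = 16L << 20;                   // 16 MB buffer
            long freeTrigger = (long) (1.05 * ramBufferSize); // ~16.8 MB (assumed)
            long freeLevel = (long) (0.95 * ramBufferSize);   // ~15.2 MB (assumed)
            long deletesRAMUsed = 6L << 20; // deletes: cannot be freed, only applied
            long numBytesAlloc = 12L << 20; // doc buffers: reclaimable blocks

            // 18 MB > 16.8 MB, so the balancing loop starts freeing blocks...
            System.out.println("balance? " + (numBytesAlloc + deletesRAMUsed > freeTrigger));
            // ...but it can only shrink numBytesAlloc; if the free blocks run
            // out while deletes keep the total above the flush trigger,
            // bufferIsFull is set and the next flush applies the deletes,
            // releasing their RAM.
        }
    }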

src/java/org/apache/lucene/index/IndexWriter.java

@@ -1729,17 +1729,28 @@ public class IndexWriter {
}
/** Determines the amount of RAM that may be used for
-* buffering added documents before they are flushed as a
-* new Segment. Generally for faster indexing performance
-* it's best to flush by RAM usage instead of document
-* count and use as large a RAM buffer as you can.
+* buffering added documents and deletions before they are
+* flushed to the Directory. Generally for faster
+* indexing performance it's best to flush by RAM usage
+* instead of document count and use as large a RAM buffer
+* as you can.
*
* <p>When this is set, the writer will flush whenever
-* buffered documents use this much RAM. Pass in {@link
-* #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
-* to RAM usage. Note that if flushing by document count
-* is also enabled, then the flush will be triggered by
-* whichever comes first.</p>
+* buffered documents and deletions use this much RAM.
+* Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+* triggering a flush due to RAM usage. Note that if
+* flushing by document count is also enabled, then the
+* flush will be triggered by whichever comes first.</p>
*
+* <p> <b>NOTE</b>: the accounting of RAM usage for pending
+* deletions is only approximate. Specifically, if you
+* delete by Query, Lucene currently has no way to measure
+* the RAM usage of individual Queries, so the accounting
+* will under-estimate and you should compensate by either
+* calling commit() periodically yourself, or by using
+* {@link #setMaxBufferedDeleteTerms} to flush by count
+* instead of RAM usage (each buffered delete Query counts
+* as one).
*
* <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
*
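A usage sketch of the advice in that NOTE (the analyzer, directory, and the 1000-delete figure are illustrative choices, not part of this commit):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.store.RAMDirectory;

    public class DeleteByQueryRAMExample {
        public static void main(String[] args) throws Exception {
            IndexWriter writer = new IndexWriter(new RAMDirectory(),
                new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
            writer.setRAMBufferSizeMB(32.0);        // flush when docs + deletes near 32 MB
            writer.setMaxBufferedDeleteTerms(1000); // safety net: Query RAM is under-counted,
                                                    // but each buffered Query counts as one here
            for (int i = 0; i < 10000; i++)
                writer.deleteDocuments(new TermQuery(new Term("id", Integer.toString(i))));
            writer.commit(); // or commit periodically inside the loop
            writer.close();
        }
    }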
@@ -4089,7 +4100,10 @@
flushCount++;
-flushDeletes |= docWriter.deletesFull();
+// If we are flushing because too many deletes
+// accumulated, then we should apply the deletes to free
+// RAM:
+flushDeletes |= docWriter.doApplyDeletes();
// When autoCommit=true we must always flush deletes
// when flushing a segment; otherwise deletes may become