LUCENE-1717: properly account for RAM used by buffered deletes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@792532 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-07-09 12:44:57 +00:00
parent 343f292dad
commit 1f1fa05a7e
4 changed files with 103 additions and 27 deletions

CHANGES.txt

@@ -127,6 +127,9 @@ Changes in runtime behavior
is failing to close reader/writers. (Brian Groose via Mike
McCandless)
+9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+buffered deletions. (Mike McCandless)
API Changes
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is

src/java/org/apache/lucene/index/BufferedDeletes.java

@@ -35,6 +35,7 @@ class BufferedDeletes {
HashMap terms = new HashMap();
HashMap queries = new HashMap();
List docIDs = new ArrayList();
+long bytesUsed;
// Number of documents a delete term applies to.
final static class Num {
@@ -60,17 +61,21 @@ class BufferedDeletes {
}
}
int size() {
+// We use numTerms not terms.size() intentionally, so
+// that deletes by the same term multiple times "count",
+// ie if you ask to flush every 1000 deletes then even
+// dup'd terms are counted towards that 1000
return numTerms + queries.size() + docIDs.size();
}
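An aside on that counting rule: the term map keeps one entry per unique term, while numTerms is bumped on every buffered delete, so the two can diverge. A standalone sketch of the distinction (illustrative only, not Lucene code):

    import java.util.HashMap;

    // Why size() uses numTerms rather than terms.size(): deleting by the
    // same term three times leaves one map entry but counts three deletes.
    public class DupTermCounting {
        public static void main(String[] args) {
            HashMap terms = new HashMap();  // raw types, matching the era
            int numTerms = 0;
            for (int i = 0; i < 3; i++) {
                terms.put("id:42", new Integer(i)); // same key: entry is overwritten
                numTerms++;                         // but every delete still counts
            }
            // Prints "terms.size()=1 numTerms=3": flush-by-count sees 3, not 1.
            System.out.println("terms.size()=" + terms.size() + " numTerms=" + numTerms);
        }
    }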
void update(BufferedDeletes in) {
numTerms += in.numTerms;
+bytesUsed += in.bytesUsed;
terms.putAll(in.terms);
queries.putAll(in.queries);
docIDs.addAll(in.docIDs);
-in.terms.clear();
-in.numTerms = 0;
-in.queries.clear();
-in.docIDs.clear();
+in.clear();
}
void clear() {
@@ -78,6 +83,11 @@ class BufferedDeletes {
queries.clear();
docIDs.clear();
numTerms = 0;
+bytesUsed = 0;
}
+void addBytesUsed(long b) {
+bytesUsed += b;
+}
boolean any() {

src/java/org/apache/lucene/index/DocumentsWriter.java

@@ -38,6 +38,7 @@ import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
/**
* This class accepts multiple added documents and directly
@@ -887,8 +888,25 @@ final class DocumentsWriter {
}
synchronized boolean deletesFull() {
-return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
-&& ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size()) >= maxBufferedDeleteTerms);
+return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+(deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+(maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
}
+synchronized boolean doApplyDeletes() {
+// Very similar to deletesFull(), except we don't count
+// numBytesAlloc, because we are checking whether
+// deletes (alone) are consuming too many resources now
+// and thus should be applied. We apply deletes if RAM
+// usage is > 1/2 of our allowed RAM buffer, to prevent
+// too-frequent flushing of a long tail of tiny segments
+// when merges (which always apply deletes) are
+// infrequent.
+return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+(deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+(maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+}
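The two thresholds above are deliberately different: deletesFull() fires when documents plus deletes fill the whole buffer, while doApplyDeletes() fires once deletes alone occupy half of it. A rough worked example (illustrative values, assuming a 16 MB buffer with both auto-flush triggers enabled):

    // Illustrative arithmetic only, not Lucene code.
    public class DeleteTriggers {
        public static void main(String[] args) {
            long ramBufferSize = 16L * 1024 * 1024; // 16 MB buffer
            long deletesBytes = 9L * 1024 * 1024;   // RAM held by buffered deletes
            long docBytes = 2L * 1024 * 1024;       // RAM held by buffered documents

            // deletesFull(): docs + deletes must reach the full buffer.
            boolean deletesFull = (deletesBytes + docBytes) >= ramBufferSize; // 11 MB < 16 MB: false
            // doApplyDeletes(): deletes alone must reach half the buffer.
            boolean applyDeletes = deletesBytes >= ramBufferSize / 2;         // 9 MB >= 8 MB: true

            System.out.println("deletesFull=" + deletesFull + " applyDeletes=" + applyDeletes);
        }
    }

So in this state the writer does not yet flush for RAM, but the next flush it performs will apply the buffered deletes, keeping a long tail of deletions from monopolizing the buffer between infrequent merges.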
synchronized private boolean timeToFlushDeletes() {
@@ -1015,20 +1033,24 @@ final class DocumentsWriter {
else
num.setNum(docIDUpto);
deletesInRAM.numTerms++;
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
}
// Buffer a specific docID for deletion. Currently only
// used when we hit an exception when adding a document
synchronized private void addDeleteDocID(int docID) {
deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
}
synchronized private void addDeleteQuery(Query query, int docID) {
deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
}
synchronized boolean doBalanceRAM() {
-return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
}
/** Does the synchronized work to finish/flush the
@@ -1044,7 +1066,6 @@
assert docWriter == null || docWriter.docID == perThread.docState.docID;
if (aborting) {
// We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
final SkipDocWriter skipDocWriter = new SkipDocWriter();
long getRAMUsed() {
-return numBytesUsed;
+return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
}
long numBytesAlloc;
@@ -1137,10 +1158,34 @@
+// Coarse estimates used to measure RAM usage of buffered deletes
final static int OBJECT_HEADER_BYTES = 8;
-final static int POINTER_NUM_BYTE = 4;
+final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
final static int INT_NUM_BYTE = 4;
final static int CHAR_NUM_BYTE = 2;
+/* Rough logic: HashMap has an array[Entry] w/ varying
+load factor (say 2 * POINTER). Entry is object w/ Term
+key, BufferedDeletes.Num val, int hash, Entry next
+(OBJ_HEADER + 3*POINTER + INT). Term is object w/
+String field and String text (OBJ_HEADER + 2*POINTER).
+We don't count Term's field since it's interned.
+Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+OBJ_HEADER + string.length*CHAR). BufferedDeletes.Num is
+OBJ_HEADER + INT. */
+final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+/* Rough logic: del docIDs are List<Integer>. Say list
+allocates ~2X size (2*POINTER). Integer is OBJ_HEADER + int */
+final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+/* Rough logic: HashMap has an array[Entry] w/ varying
+load factor (say 2 * POINTER). Entry is object w/
+Query key, Integer val, int hash, Entry next
+(OBJ_HEADER + 3*POINTER + INT). Query we often
+undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
+final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24;
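Plugging the constants in makes these estimates concrete. On a 32-bit JRE (POINTER_NUM_BYTE = 4):

    BYTES_PER_DEL_TERM  = 8*4 + 5*8 + 6*4       = 96 bytes, plus 2 bytes per character of term text
    BYTES_PER_DEL_DOCID = 2*4 + 8 + 4           = 20 bytes
    BYTES_PER_DEL_QUERY = 5*4 + 2*8 + 2*4 + 24  = 68 bytes

On a 64-bit JRE (POINTER_NUM_BYTE = 8) the same formulas give 128, 28 and 88 bytes. The estimates are deliberately coarse; the true footprint varies by JVM, but they only drive flush triggers, so consistency matters more than exactness.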
/* Initial chunks size of the shared byte[] blocks used to
store postings data */
final static int BYTE_BLOCK_SHIFT = 15;
@@ -1285,17 +1330,20 @@
// We flush when we've used our target usage
final long flushTrigger = ramBufferSize;
-if (numBytesAlloc > freeTrigger) {
+final long deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
+if (numBytesAlloc+deletesRAMUsed > freeTrigger) {
if (infoStream != null)
message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
" vs trigger=" + toMB(flushTrigger) +
" allocMB=" + toMB(numBytesAlloc) +
" deletesMB=" + toMB(deletesRAMUsed) +
" vs trigger=" + toMB(freeTrigger) +
" byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
" charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
-final long startBytesAlloc = numBytesAlloc;
+final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
int iter = 0;
@@ -1305,12 +1353,12 @@
boolean any = true;
-while(numBytesAlloc > freeLevel) {
+while(numBytesAlloc+deletesRAMUsed > freeLevel) {
synchronized(this) {
if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) {
// Nothing else to free -- must flush now.
-bufferIsFull = numBytesUsed > flushTrigger;
+bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
if (infoStream != null) {
if (numBytesUsed > flushTrigger)
message(" nothing to free; now set bufferIsFull");
@@ -1345,7 +1393,7 @@
}
if (infoStream != null)
message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.) + " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
} else {
// If we have not crossed the 100% mark, but have
@@ -1355,10 +1403,11 @@
// flush.
synchronized(this) {
-if (numBytesUsed > flushTrigger) {
+if (numBytesUsed+deletesRAMUsed > flushTrigger) {
if (infoStream != null)
message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) +
" allocMB=" + nf.format(numBytesAlloc/1024./1024.) +
" deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) +
" triggerMB=" + nf.format(flushTrigger/1024./1024.));
bufferIsFull = true;
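The balanceRAM() changes in the hunks above all follow one rule: buffered-delete RAM counts toward the triggers but cannot be reclaimed by freeing blocks, only by a flush that applies the deletes. A sketch of the resulting arithmetic (the ~105%/95% trigger levels are assumptions about the surrounding, unshown code, not values visible in this diff):

    // Illustrative only, not Lucene code; trigger levels are assumed.
    public class BalanceRAMSketch {
        public static void main(String[] args) {
            long ramBufferSize = 16L << 20;                   // 16 MB buffer
            long freeTrigger = (long) (1.05 * ramBufferSize); // ~16.8 MB (assumed)
            long freeLevel = (long) (0.95 * ramBufferSize);   // ~15.2 MB (assumed)
            long deletesRAMUsed = 6L << 20; // deletes: cannot be freed, only applied
            long numBytesAlloc = 12L << 20; // doc buffers: reclaimable blocks

            // 18 MB > 16.8 MB, so the balancing loop starts freeing blocks...
            System.out.println("balance? " + (numBytesAlloc + deletesRAMUsed > freeTrigger));
            // ...but it can only shrink numBytesAlloc; if the free blocks run
            // out while deletes keep the total above the flush trigger,
            // bufferIsFull is set and the next flush applies the deletes,
            // releasing their RAM.
        }
    }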

src/java/org/apache/lucene/index/IndexWriter.java

@@ -1729,17 +1729,28 @@ public class IndexWriter {
}
/** Determines the amount of RAM that may be used for
-* buffering added documents before they are flushed as a
-* new Segment. Generally for faster indexing performance
-* it's best to flush by RAM usage instead of document
-* count and use as large a RAM buffer as you can.
+* buffering added documents and deletions before they are
+* flushed to the Directory. Generally for faster
+* indexing performance it's best to flush by RAM usage
+* instead of document count and use as large a RAM buffer
+* as you can.
*
* <p>When this is set, the writer will flush whenever
-* buffered documents use this much RAM. Pass in {@link
-* #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
-* to RAM usage. Note that if flushing by document count
-* is also enabled, then the flush will be triggered by
-* whichever comes first.</p>
+* buffered documents and deletions use this much RAM.
+* Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+* triggering a flush due to RAM usage. Note that if
+* flushing by document count is also enabled, then the
+* flush will be triggered by whichever comes first.</p>
*
+* <p> <b>NOTE</b>: the accounting of RAM usage for pending
+* deletions is only approximate. Specifically, if you
+* delete by Query, Lucene currently has no way to measure
+* the RAM usage of individual Queries, so the accounting
+* will under-estimate and you should compensate by either
+* calling commit() periodically yourself, or by using
+* {@link #setMaxBufferedDeleteTerms} to flush by count
+* instead of RAM usage (each buffered delete Query counts
+* as one).
*
* <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
*
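A usage sketch of the advice in that NOTE (the analyzer, directory, and the 1000-delete figure are illustrative choices, not part of this commit):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.store.RAMDirectory;

    public class DeleteByQueryRAMExample {
        public static void main(String[] args) throws Exception {
            IndexWriter writer = new IndexWriter(new RAMDirectory(),
                new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
            writer.setRAMBufferSizeMB(32.0);        // flush when docs + deletes near 32 MB
            writer.setMaxBufferedDeleteTerms(1000); // safety net: Query RAM is under-counted,
                                                    // but each buffered Query counts as one here
            for (int i = 0; i < 10000; i++)
                writer.deleteDocuments(new TermQuery(new Term("id", Integer.toString(i))));
            writer.commit(); // or commit periodically inside the loop
            writer.close();
        }
    }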
@@ -4089,7 +4100,10 @@
flushCount++;
-flushDeletes |= docWriter.deletesFull();
+// If we are flushing because too many deletes
+// accumulated, then we should apply the deletes to free
+// RAM:
+flushDeletes |= docWriter.doApplyDeletes();
// When autoCommit=true we must always flush deletes
// when flushing a segment; otherwise deletes may become