LUCENE-9514: Include TermVectorsWriter in DWPT accounting (#1847)

TermVectorsWriter might consume a significant amount of heap memory, which
can affect the IndexWriter's decisions about whether writers should be
stalled or DWPTs should be flushed, especially when the memory settings
in the IWC are small and flushes are frequent. This change adds
RAM accounting to the TermVectorsWriter, since it is part of the
DWPT lifecycle and not only present during flush.
Simon Willnauer 2020-09-14 11:11:06 +02:00 committed by GitHub
parent 4d46caa05d
commit 8f895d9075
11 changed files with 78 additions and 23 deletions
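
For context (not part of the diff below): the accounted bytes feed exactly the two decisions named in the commit message. A minimal sketch of such a gate, assuming purely illustrative names (SimpleFlushGate and its thresholds are not Lucene's FlushPolicy or DocumentsWriterFlushControl API):

// Hypothetical illustration: how accounted heap can drive flush/stall decisions.
final class SimpleFlushGate {
  private final long flushTriggerBytes; // e.g. derived from the IWC RAM buffer size
  private final long stallTriggerBytes; // e.g. a multiple of the flush trigger

  SimpleFlushGate(long flushTriggerBytes, long stallTriggerBytes) {
    this.flushTriggerBytes = flushTriggerBytes;
    this.stallTriggerBytes = stallTriggerBytes;
  }

  // flush a DWPT once its accounted bytes cross the trigger
  boolean shouldFlush(long dwptAccountedBytes) {
    return dwptAccountedBytes >= flushTriggerBytes;
  }

  // stall incoming writers once total accounted bytes run too far ahead of flushing
  boolean shouldStall(long totalAccountedBytes) {
    return totalAccountedBytes >= stallTriggerBytes;
  }
}

If the term-vectors buffers are not included in dwptAccountedBytes, both checks trigger later than they should, which is what this change addresses.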

View File

@ -221,6 +221,10 @@ Improvements
heap consumption is taken into account when IndexWriter stalls or should flush
DWPTs. (Simon Willnauer)
* LUCENE-9514: Include TermVectorsWriter in DWPT accounting to ensure that its
heap consumption is taken into account when IndexWriter stalls or should flush
DWPTs. (Simon Willnauer)
Optimizations
---------------------

View File

@ -56,8 +56,6 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
static final String VECTORS_EXTENSION = "vec";
private final Directory directory;
private final String segment;
private IndexOutput out;
private int numDocsWritten = 0;
private final BytesRefBuilder scratch = new BytesRefBuilder();
@ -66,8 +64,6 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
private boolean payloads;
public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
this.directory = directory;
this.segment = segment;
boolean success = false;
try {
out = directory.createOutput(IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION), context);
@ -193,4 +189,9 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
private void newLine() throws IOException {
SimpleTextUtil.writeNewline(out);
}
@Override
public long ramBytesUsed() {
return scratch.get().bytes.length;
}
}

View File

@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@ -58,7 +59,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
*
* @lucene.experimental
*/
public abstract class TermVectorsWriter implements Closeable {
public abstract class TermVectorsWriter implements Closeable, Accountable {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
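
Because the abstract class now implements Accountable, every concrete term-vectors writer must report its buffered heap. A stand-alone sketch of the reporting pattern the writers in this commit follow, with illustrative fields (it implements Accountable directly rather than subclassing TermVectorsWriter, to stay self-contained):

import java.util.Collection;
import java.util.List;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;

// Illustrative sketch: report the growable buffers, ignore small fixed fields.
final class BufferingVectorsExample implements Accountable {
  private final BytesRefBuilder termScratch = new BytesRefBuilder(); // grows while buffering terms
  private int[] positionsBuf = new int[8];                           // grows while buffering positions

  @Override
  public long ramBytesUsed() {
    return termScratch.get().bytes.length + RamUsageEstimator.sizeOf(positionsBuf);
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return List.of(); // no accountable children in this sketch
  }
}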

View File

@ -20,8 +20,10 @@ package org.apache.lucene.codecs.compressing;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
@ -41,6 +43,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@ -206,7 +209,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
private final BlockPackedWriter writer;
/** Sole constructor. */
public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
CompressingTermVectorsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
String formatName, CompressionMode compressionMode, int chunkSize, int blockShift) throws IOException {
assert directory != null;
this.segment = si.name;
@ -857,4 +860,15 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
return candidate.getNumDirtyChunks() > 1024 ||
candidate.getNumDirtyChunks() * 100 > candidate.getNumChunks();
}
@Override
public long ramBytesUsed() {
return positionsBuf.length + startOffsetsBuf.length + lengthsBuf.length + payloadLengthsBuf.length
+ termSuffixes.ramBytesUsed() + payloadBytes.ramBytesUsed() + lastTerm.bytes.length;
}
@Override
public Collection<Accountable> getChildResources() {
return List.of(termSuffixes, payloadBytes);
}
}
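
With getChildResources() implemented, the buffered state can be broken down with the existing Accountables helper; a small usage sketch (RamDebug and describe are illustrative names, Accountables.toString is the real utility in org.apache.lucene.util):

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;

// Sketch: render an accountable and its child resources as a readable tree,
// e.g. to see how much of a DWPT sits in term-vector buffers.
final class RamDebug {
  static String describe(Accountable writer) {
    return Accountables.toString(writer);
  }
}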

View File

@ -21,6 +21,7 @@ import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -43,6 +44,7 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
@ -65,6 +67,8 @@ final class DefaultIndexingChain extends DocConsumer {
final TermsHash termsHash;
// Writes stored fields
final StoredFieldsConsumer storedFieldsConsumer;
final TermVectorsConsumer termVectorsWriter;
// NOTE: I tried using HashMap<String,PerField>
// but it was ~2% slower on Wiki and Geonames with Java
@ -95,7 +99,6 @@ final class DefaultIndexingChain extends DocConsumer {
this.infoStream = indexWriterConfig.getInfoStream();
this.abortingExceptionConsumer = abortingExceptionConsumer;
final TermsHash termVectorsWriter;
if (segmentInfo.getIndexSort() == null) {
storedFieldsConsumer = new StoredFieldsConsumer(indexWriterConfig.getCodec(), directory, segmentInfo);
termVectorsWriter = new TermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
@ -795,7 +798,13 @@ final class DefaultIndexingChain extends DocConsumer {
@Override
public long ramBytesUsed() {
return bytesUsed.get() + storedFieldsConsumer.ramBytesUsed();
return bytesUsed.get() + storedFieldsConsumer.accountable.ramBytesUsed()
+ termVectorsWriter.accountable.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return List.of(storedFieldsConsumer.accountable, termVectorsWriter.accountable);
}
/** NOTE: not static: accesses at least docState, termsHash. */
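
The chain-level accounting is plain composition: the indexing chain sums its own counter with the accountables of its consumers and exposes them as children. A sketch of that pattern, assuming illustrative names (ChainRamExample and its fields are not the actual indexing-chain members):

import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.util.Accountable;

// Illustrative composition: own counter plus child accountables.
final class ChainRamExample implements Accountable {
  private final AtomicLong ownBytes = new AtomicLong(); // e.g. postings buffers
  private final Accountable storedFields;               // child accountable
  private final Accountable termVectors;                // child accountable

  ChainRamExample(Accountable storedFields, Accountable termVectors) {
    this.storedFields = storedFields;
    this.termVectors = termVectors;
  }

  @Override
  public long ramBytesUsed() {
    return ownBytes.get() + storedFields.ramBytesUsed() + termVectors.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return List.of(storedFields, termVectors);
  }
}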

View File

@ -185,4 +185,6 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
assert fieldCount == numFields;
writer.finishDocument();
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOUtils;
class StoredFieldsConsumer {
@ -30,6 +31,9 @@ class StoredFieldsConsumer {
final Directory directory;
final SegmentInfo info;
StoredFieldsWriter writer;
// this accountable either holds the writer or a no-op that reports 0 bytes;
// it's cleaner than checking if the writer is null all over the place
Accountable accountable = Accountable.NULL_ACCOUNTABLE;
private int lastDoc;
StoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) {
@ -42,6 +46,7 @@ class StoredFieldsConsumer {
protected void initStoredFieldsWriter() throws IOException {
if (writer == null) { // TODO can we allocate this in the ctor? we call start document for every doc anyway
this.writer = codec.storedFieldsFormat().fieldsWriter(directory, info, IOContext.DEFAULT);
accountable = writer;
}
}
@ -76,18 +81,10 @@ class StoredFieldsConsumer {
writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
} finally {
IOUtils.close(writer);
writer = null;
}
}
void abort() {
if (writer != null) {
IOUtils.closeWhileHandlingException(writer);
writer = null;
}
}
long ramBytesUsed() {
return writer == null ? 0 : writer.ramBytesUsed();
}
}
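
The comment above describes a small null-object trick: the consumer always exposes a non-null Accountable, starting with the zero-reporting NULL_ACCOUNTABLE and swapping in the real writer once it is lazily created. A stand-alone sketch of the pattern, assuming illustrative names (LazyWriterHolder is not the actual consumer):

import org.apache.lucene.util.Accountable;

// Illustrative null-object pattern: callers can always ask for ramBytesUsed()
// without checking whether the underlying writer exists yet.
final class LazyWriterHolder {
  private Accountable writer;                             // created on first document
  Accountable accountable = Accountable.NULL_ACCOUNTABLE; // reports 0 until then

  void ensureWriter() {
    if (writer == null) {
      writer = () -> 1_000_000L; // stand-in for the real writer's heap estimate
      accountable = writer;      // from now on the real usage is reported
    }
  }

  long currentBytes() {
    return accountable.ramBytesUsed(); // no null check needed
  }
}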

View File

@ -27,6 +27,7 @@ import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
@ -54,6 +55,9 @@ class TermVectorsConsumer extends TermsHash {
private int numVectorFields;
int lastDocID;
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
// this accountable either holds the writer or a no-op that reports 0 bytes;
// it's cleaner than checking if the writer is null all over the place
Accountable accountable = Accountable.NULL_ACCOUNTABLE;
TermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info, Codec codec) {
super(intBlockAllocator, byteBlockAllocator, Counter.newCounter(), null);
@ -74,9 +78,6 @@ class TermVectorsConsumer extends TermsHash {
writer.finish(state.fieldInfos, numDocs);
} finally {
IOUtils.close(writer);
writer = null;
lastDocID = 0;
hasVectors = false;
}
}
}
@ -96,6 +97,7 @@ class TermVectorsConsumer extends TermsHash {
IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
writer = codec.termVectorsFormat().vectorsWriter(directory, info, context);
lastDocID = 0;
accountable = writer;
}
}
@ -130,13 +132,10 @@ class TermVectorsConsumer extends TermsHash {
@Override
public void abort() {
hasVectors = false;
try {
super.abort();
} finally {
IOUtils.closeWhileHandlingException(writer);
writer = null;
lastDocID = 0;
reset();
}
}
@ -167,4 +166,5 @@ class TermVectorsConsumer extends TermsHash {
resetFields();
numVectorFields = 0;
}
}

View File

@ -41,4 +41,8 @@ public interface Accountable {
return Collections.emptyList();
}
/**
* An accountable that always returns 0
*/
Accountable NULL_ACCOUNTABLE = () -> 0;
}
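
Since ramBytesUsed() is the only abstract method, the no-op constant can be a lambda, and the same shorthand works for any fixed-size estimate; a small usage sketch (NullAccountableDemo is illustrative):

import org.apache.lucene.util.Accountable;

// Sketch: NULL_ACCOUNTABLE as a placeholder, and a lambda for a fixed estimate.
final class NullAccountableDemo {
  public static void main(String[] args) {
    Accountable pending = Accountable.NULL_ACCOUNTABLE; // reports 0 bytes
    Accountable fixed = () -> 4096L;                    // single-method interface, so a lambda works
    System.out.println(pending.ramBytesUsed() + " / " + fixed.ramBytesUsed());
  }
}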

View File

@ -18,6 +18,7 @@ package org.apache.lucene.codecs.asserting;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
@ -210,5 +211,14 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat {
in.close(); // close again
}
@Override
public long ramBytesUsed() {
return in.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.singleton(in);
}
}
}

View File

@ -17,6 +17,8 @@
package org.apache.lucene.codecs.cranky;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Random;
import org.apache.lucene.codecs.TermVectorsFormat;
@ -29,6 +31,7 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
class CrankyTermVectorsFormat extends TermVectorsFormat {
@ -151,5 +154,15 @@ class CrankyTermVectorsFormat extends TermVectorsFormat {
}
super.addProx(numProx, positions, offsets);
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.singleton(delegate);
}
}
}