mirror of https://github.com/apache/lucene.git
LUCENE-9514: Include TermVectorsWriter in DWPT accounting (#1847)
TermVectorsWriter might consume some heap space memory that can have a significant impact on decisions made in the IW if writers should be stalled or DWPTs should be flushed if memory settings are small in IWC and flushes are frequent. This change adds RAM accounting to the TermVectorsWriter since it's part of the DWPT lifecycle and not just present during flush.
This commit is contained in:
parent
4d46caa05d
commit
8f895d9075
|
@ -221,6 +221,10 @@ Improvements
|
||||||
heap consumption is taken into account when IndexWriter stalls or should flush
|
heap consumption is taken into account when IndexWriter stalls or should flush
|
||||||
DWPTs. (Simon Willnauer)
|
DWPTs. (Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-9514: Include TermVectorsWriter in DWPT accounting to ensure that it's
|
||||||
|
heap consumption is taken into account when IndexWriter stalls or should flush
|
||||||
|
DWPTs. (Simon Willnauer)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -56,8 +56,6 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
||||||
|
|
||||||
static final String VECTORS_EXTENSION = "vec";
|
static final String VECTORS_EXTENSION = "vec";
|
||||||
|
|
||||||
private final Directory directory;
|
|
||||||
private final String segment;
|
|
||||||
private IndexOutput out;
|
private IndexOutput out;
|
||||||
private int numDocsWritten = 0;
|
private int numDocsWritten = 0;
|
||||||
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
|
@ -66,8 +64,6 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
||||||
private boolean payloads;
|
private boolean payloads;
|
||||||
|
|
||||||
public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
||||||
this.directory = directory;
|
|
||||||
this.segment = segment;
|
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
out = directory.createOutput(IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION), context);
|
out = directory.createOutput(IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION), context);
|
||||||
|
@ -193,4 +189,9 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
||||||
private void newLine() throws IOException {
|
private void newLine() throws IOException {
|
||||||
SimpleTextUtil.writeNewline(out);
|
SimpleTextUtil.writeNewline(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return scratch.get().bytes.length;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
|
||||||
|
@ -58,7 +59,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public abstract class TermVectorsWriter implements Closeable {
|
public abstract class TermVectorsWriter implements Closeable, Accountable {
|
||||||
|
|
||||||
/** Sole constructor. (For invocation by subclass
|
/** Sole constructor. (For invocation by subclass
|
||||||
* constructors, typically implicit.) */
|
* constructors, typically implicit.) */
|
||||||
|
|
|
@ -20,8 +20,10 @@ package org.apache.lucene.codecs.compressing;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayDeque;
|
import java.util.ArrayDeque;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Deque;
|
import java.util.Deque;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
@ -41,6 +43,7 @@ import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -206,7 +209,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
|
||||||
private final BlockPackedWriter writer;
|
private final BlockPackedWriter writer;
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
|
CompressingTermVectorsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
|
||||||
String formatName, CompressionMode compressionMode, int chunkSize, int blockShift) throws IOException {
|
String formatName, CompressionMode compressionMode, int chunkSize, int blockShift) throws IOException {
|
||||||
assert directory != null;
|
assert directory != null;
|
||||||
this.segment = si.name;
|
this.segment = si.name;
|
||||||
|
@ -857,4 +860,15 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
|
||||||
return candidate.getNumDirtyChunks() > 1024 ||
|
return candidate.getNumDirtyChunks() > 1024 ||
|
||||||
candidate.getNumDirtyChunks() * 100 > candidate.getNumChunks();
|
candidate.getNumDirtyChunks() * 100 > candidate.getNumChunks();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return positionsBuf.length + startOffsetsBuf.length + lengthsBuf.length + payloadLengthsBuf.length
|
||||||
|
+ termSuffixes.ramBytesUsed() + payloadBytes.ramBytesUsed() + lastTerm.bytes.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<Accountable> getChildResources() {
|
||||||
|
return List.of(termSuffixes, payloadBytes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -43,6 +44,7 @@ import org.apache.lucene.search.SortField;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -65,6 +67,8 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
final TermsHash termsHash;
|
final TermsHash termsHash;
|
||||||
// Writes stored fields
|
// Writes stored fields
|
||||||
final StoredFieldsConsumer storedFieldsConsumer;
|
final StoredFieldsConsumer storedFieldsConsumer;
|
||||||
|
final TermVectorsConsumer termVectorsWriter;
|
||||||
|
|
||||||
|
|
||||||
// NOTE: I tried using Hash Map<String,PerField>
|
// NOTE: I tried using Hash Map<String,PerField>
|
||||||
// but it was ~2% slower on Wiki and Geonames with Java
|
// but it was ~2% slower on Wiki and Geonames with Java
|
||||||
|
@ -95,7 +99,6 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
this.infoStream = indexWriterConfig.getInfoStream();
|
this.infoStream = indexWriterConfig.getInfoStream();
|
||||||
this.abortingExceptionConsumer = abortingExceptionConsumer;
|
this.abortingExceptionConsumer = abortingExceptionConsumer;
|
||||||
|
|
||||||
final TermsHash termVectorsWriter;
|
|
||||||
if (segmentInfo.getIndexSort() == null) {
|
if (segmentInfo.getIndexSort() == null) {
|
||||||
storedFieldsConsumer = new StoredFieldsConsumer(indexWriterConfig.getCodec(), directory, segmentInfo);
|
storedFieldsConsumer = new StoredFieldsConsumer(indexWriterConfig.getCodec(), directory, segmentInfo);
|
||||||
termVectorsWriter = new TermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
|
termVectorsWriter = new TermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
|
||||||
|
@ -795,7 +798,13 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long ramBytesUsed() {
|
public long ramBytesUsed() {
|
||||||
return bytesUsed.get() + storedFieldsConsumer.ramBytesUsed();
|
return bytesUsed.get() + storedFieldsConsumer.accountable.ramBytesUsed()
|
||||||
|
+ termVectorsWriter.accountable.ramBytesUsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<Accountable> getChildResources() {
|
||||||
|
return List.of(storedFieldsConsumer.accountable, termVectorsWriter.accountable);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** NOTE: not static: accesses at least docState, termsHash. */
|
/** NOTE: not static: accesses at least docState, termsHash. */
|
||||||
|
|
|
@ -185,4 +185,6 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
|
||||||
assert fieldCount == numFields;
|
assert fieldCount == numFields;
|
||||||
writer.finishDocument();
|
writer.finishDocument();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
class StoredFieldsConsumer {
|
class StoredFieldsConsumer {
|
||||||
|
@ -30,6 +31,9 @@ class StoredFieldsConsumer {
|
||||||
final Directory directory;
|
final Directory directory;
|
||||||
final SegmentInfo info;
|
final SegmentInfo info;
|
||||||
StoredFieldsWriter writer;
|
StoredFieldsWriter writer;
|
||||||
|
// this accountable either holds the writer or one that returns null.
|
||||||
|
// it's cleaner than checking if the writer is null all over the place
|
||||||
|
Accountable accountable = Accountable.NULL_ACCOUNTABLE;
|
||||||
private int lastDoc;
|
private int lastDoc;
|
||||||
|
|
||||||
StoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) {
|
StoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) {
|
||||||
|
@ -42,6 +46,7 @@ class StoredFieldsConsumer {
|
||||||
protected void initStoredFieldsWriter() throws IOException {
|
protected void initStoredFieldsWriter() throws IOException {
|
||||||
if (writer == null) { // TODO can we allocate this in the ctor? we call start document for every doc anyway
|
if (writer == null) { // TODO can we allocate this in the ctor? we call start document for every doc anyway
|
||||||
this.writer = codec.storedFieldsFormat().fieldsWriter(directory, info, IOContext.DEFAULT);
|
this.writer = codec.storedFieldsFormat().fieldsWriter(directory, info, IOContext.DEFAULT);
|
||||||
|
accountable = writer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -76,18 +81,10 @@ class StoredFieldsConsumer {
|
||||||
writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
|
writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.close(writer);
|
IOUtils.close(writer);
|
||||||
writer = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void abort() {
|
void abort() {
|
||||||
if (writer != null) {
|
IOUtils.closeWhileHandlingException(writer);
|
||||||
IOUtils.closeWhileHandlingException(writer);
|
|
||||||
writer = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
long ramBytesUsed() {
|
|
||||||
return writer == null ? 0 : writer.ramBytesUsed();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.codecs.TermVectorsWriter;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FlushInfo;
|
import org.apache.lucene.store.FlushInfo;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -54,6 +55,9 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
private int numVectorFields;
|
private int numVectorFields;
|
||||||
int lastDocID;
|
int lastDocID;
|
||||||
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
|
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
|
||||||
|
// this accountable either holds the writer or one that returns null.
|
||||||
|
// it's cleaner than checking if the writer is null all over the place
|
||||||
|
Accountable accountable = Accountable.NULL_ACCOUNTABLE;
|
||||||
|
|
||||||
TermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info, Codec codec) {
|
TermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info, Codec codec) {
|
||||||
super(intBlockAllocator, byteBlockAllocator, Counter.newCounter(), null);
|
super(intBlockAllocator, byteBlockAllocator, Counter.newCounter(), null);
|
||||||
|
@ -74,9 +78,6 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
writer.finish(state.fieldInfos, numDocs);
|
writer.finish(state.fieldInfos, numDocs);
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.close(writer);
|
IOUtils.close(writer);
|
||||||
writer = null;
|
|
||||||
lastDocID = 0;
|
|
||||||
hasVectors = false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -96,6 +97,7 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
|
IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
|
||||||
writer = codec.termVectorsFormat().vectorsWriter(directory, info, context);
|
writer = codec.termVectorsFormat().vectorsWriter(directory, info, context);
|
||||||
lastDocID = 0;
|
lastDocID = 0;
|
||||||
|
accountable = writer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,13 +132,10 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void abort() {
|
public void abort() {
|
||||||
hasVectors = false;
|
|
||||||
try {
|
try {
|
||||||
super.abort();
|
super.abort();
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeWhileHandlingException(writer);
|
IOUtils.closeWhileHandlingException(writer);
|
||||||
writer = null;
|
|
||||||
lastDocID = 0;
|
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -167,4 +166,5 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
resetFields();
|
resetFields();
|
||||||
numVectorFields = 0;
|
numVectorFields = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,4 +41,8 @@ public interface Accountable {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An accountable that always returns 0
|
||||||
|
*/
|
||||||
|
Accountable NULL_ACCOUNTABLE = () -> 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.codecs.asserting;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||||
import org.apache.lucene.codecs.TermVectorsReader;
|
import org.apache.lucene.codecs.TermVectorsReader;
|
||||||
|
@ -210,5 +211,14 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat {
|
||||||
in.close(); // close again
|
in.close(); // close again
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return in.ramBytesUsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<Accountable> getChildResources() {
|
||||||
|
return Collections.singleton(in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
package org.apache.lucene.codecs.cranky;
|
package org.apache.lucene.codecs.cranky;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||||
|
@ -29,6 +31,7 @@ import org.apache.lucene.index.SegmentInfo;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
class CrankyTermVectorsFormat extends TermVectorsFormat {
|
class CrankyTermVectorsFormat extends TermVectorsFormat {
|
||||||
|
@ -151,5 +154,15 @@ class CrankyTermVectorsFormat extends TermVectorsFormat {
|
||||||
}
|
}
|
||||||
super.addProx(numProx, positions, offsets);
|
super.addProx(numProx, positions, offsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return delegate.ramBytesUsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<Accountable> getChildResources() {
|
||||||
|
return Collections.singleton(delegate);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue