LUCENE-9510: Don't compress temporary stored fields and term vectors when index sorting is enabled. (#1874)

When index sorting is enabled, stored fields and term vectors can't be
written on the fly as they are in the normal case, so they are written to
temporary files that are then rewritten in sorted order. For these temporary
files, disabling compression speeds up indexing significantly.

On a synthetic test that indexes stored fields along with a doc-value field
populated with random values and used for index sorting, this change
resulted in a 3x indexing speedup.
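
For context, index sorting is what forces this temporary-file path: the sort is configured on the IndexWriterConfig before the writer is created. Below is a minimal sketch of a comparable setup; the class name, field names, document count, and index path are illustrative, not the actual benchmark from the issue.

import java.nio.file.Paths;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.FSDirectory;

public class SortedIndexingSketch {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    // Configuring an index sort is what routes stored fields (and term vectors)
    // through the temporary files that this change stops compressing.
    config.setIndexSort(new Sort(new SortField("sort_key", SortField.Type.LONG)));

    try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/sorted-index"));
         IndexWriter writer = new IndexWriter(dir, config)) {
      for (int i = 0; i < 100_000; i++) {
        Document doc = new Document();
        // Doc-value field with random values, used as the index sort key.
        doc.add(new NumericDocValuesField("sort_key", ThreadLocalRandom.current().nextLong()));
        // Stored field content that gets buffered in the temporary stored-fields files.
        doc.add(new StoredField("payload", "stored value " + i));
        writer.addDocument(doc);
      }
    }
  }
}

At flush time, each segment's stored fields are read back from the temporary files and rewritten in the order defined by the sort key, which is the step this commit speeds up.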
Author: Adrien Grand, 2020-09-16 13:05:22 +02:00 (committed by GitHub)
Parent: 7b8e72e553
Commit: 93094ef7e4
3 changed files with 62 additions and 22 deletions

lucene/CHANGES.txt

@@ -243,6 +243,9 @@ Optimizations
* LUCENE-9373: FunctionMatchQuery now accepts a "matchCost" optimization hint.
  (Maxim Glazkov, David Smiley)

* LUCENE-9510: Indexing with an index sort is now faster by not compressing
  temporary representations of the data. (Adrien Grand)

Bug Fixes
---------------------

lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java

@@ -19,21 +19,65 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.io.Reader;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Compressor;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {

  static final CompressionMode NO_COMPRESSION = new CompressionMode() {
    @Override
    public Compressor newCompressor() {
      return new Compressor() {
        @Override
        public void close() throws IOException {}

        @Override
        public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
          out.writeBytes(bytes, off, len);
        }
      };
    }

    @Override
    public Decompressor newDecompressor() {
      return new Decompressor() {
        @Override
        public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes)
            throws IOException {
          bytes.bytes = ArrayUtil.grow(bytes.bytes, length);
          in.skipBytes(offset);
          in.readBytes(bytes.bytes, 0, length);
          bytes.offset = 0;
          bytes.length = length;
        }

        @Override
        public Decompressor clone() {
          return this;
        }
      };
    }
  };

  private static final StoredFieldsFormat TEMP_STORED_FIELDS_FORMAT = new CompressingStoredFieldsFormat(
      "TempStoredFields", NO_COMPRESSION, 128*1024, 1, 10);

  TrackingTmpOutputDirectoryWrapper tmpDirectory;

  SortingStoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) {
@@ -44,21 +88,14 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
  protected void initStoredFieldsWriter() throws IOException {
    if (writer == null) {
      this.tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory);
      this.writer = codec.storedFieldsFormat().fieldsWriter(tmpDirectory, info, IOContext.DEFAULT);
      this.writer = TEMP_STORED_FIELDS_FORMAT.fieldsWriter(tmpDirectory, info, IOContext.DEFAULT);
    }
  }

  @Override
  void flush(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
    super.flush(state, sortMap);
    if (sortMap == null) {
      // we're lucky the index is already sorted, just rename the temporary file and return
      for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
        tmpDirectory.rename(entry.getValue(), entry.getKey());
      }
      return;
    }
    StoredFieldsReader reader = codec.storedFieldsFormat()
    StoredFieldsReader reader = TEMP_STORED_FIELDS_FORMAT
        .fieldsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
    // Don't pull a merge instance, since merge instances optimize for
    // sequential access while we consume stored fields in random order here.
@@ -69,7 +106,7 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
      CopyVisitor visitor = new CopyVisitor(sortWriter);
      for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
        sortWriter.startDocument();
        reader.visitDocument(sortMap.newToOld(docID), visitor);
        reader.visitDocument(sortMap == null ? docID : sortMap.newToOld(docID), visitor);
        sortWriter.finishDocument();
      }
      sortWriter.finish(state.fieldInfos, state.segmentInfo.maxDoc());

lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java

@@ -23,8 +23,10 @@ import java.util.Map;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
@@ -35,6 +37,9 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntBlockPool;
final class SortingTermVectorsConsumer extends TermVectorsConsumer {

  private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT = new CompressingTermVectorsFormat(
      "TempTermVectors", "", SortingStoredFieldsConsumer.NO_COMPRESSION, 8*1024, 10);

  TrackingTmpOutputDirectoryWrapper tmpDirectory;

  SortingTermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info, Codec codec) {
@@ -45,22 +50,17 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
  void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap, NormsProducer norms) throws IOException {
    super.flush(fieldsToFlush, state, sortMap, norms);
    if (tmpDirectory != null) {
      if (sortMap == null) {
        // we're lucky the index is already sorted, just rename the temporary file and return
        for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
          tmpDirectory.rename(entry.getValue(), entry.getKey());
        }
        return;
      }
      TermVectorsReader reader = codec.termVectorsFormat()
      TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT
          .vectorsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
      TermVectorsReader mergeReader = reader.getMergeInstance();
      // Don't pull a merge instance, since merge instances optimize for
      // sequential access while term vectors will likely be accessed in random
      // order here.
      TermVectorsWriter writer = codec.termVectorsFormat()
          .vectorsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT);
      try {
        reader.checkIntegrity();
        for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
          Fields vectors = mergeReader.get(sortMap.newToOld(docID));
          Fields vectors = reader.get(sortMap == null ? docID : sortMap.newToOld(docID));
          writeTermVectors(writer, vectors, state.fieldInfos);
        }
        writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
@@ -77,7 +77,7 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
    if (writer == null) {
      IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
      tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory);
      writer = codec.termVectorsFormat().vectorsWriter(tmpDirectory, info, context);
      writer = TEMP_TERM_VECTORS_FORMAT.vectorsWriter(tmpDirectory, info, context);
      lastDocID = 0;
    }
  }