mirror of https://github.com/apache/lucene.git
LUCENE-9510: Don't compress temporary stored fields and term vectors when index sorting is enabled. (#1874)
When index sorting is enabled, stored fields and term vectors cannot be written on the fly as they are in the normal case, so they are first written to temporary files and then rewritten in sorted order. For these temporary files, disabling compression speeds up indexing significantly. On a synthetic test that indexes stored fields alongside a doc-value field populated with random values and used for index sorting, this change yielded a 3x indexing speedup.
parent 7b8e72e553
commit 93094ef7e4
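For context, the sketch below is not part of this commit; the field names, index path, and document count are made up. It shows the kind of setup the change targets: an IndexWriter configured with an index sort on a randomly valued doc-value field while also indexing stored fields, which routes flushes through the sorting stored-fields and term-vectors consumers modified in the diff.

    import java.nio.file.Paths;
    import java.util.Random;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.store.FSDirectory;

    public class IndexSortIndexingSketch {
      public static void main(String[] args) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig();
        // An index sort on a doc-value field forces stored fields (and term vectors, if any)
        // to be buffered in temporary files and rewritten in sorted order at flush time.
        iwc.setIndexSort(new Sort(new SortField("sortKey", SortField.Type.LONG)));
        try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get("sorted-index")), iwc)) {
          Random random = new Random();
          for (int i = 0; i < 10_000; i++) {
            Document doc = new Document();
            // Random sort values, mirroring the synthetic test described in the commit message.
            doc.add(new NumericDocValuesField("sortKey", random.nextLong()));
            doc.add(new StoredField("body", "stored content for doc " + i));
            writer.addDocument(doc);
          }
        }
      }
    }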
CHANGES.txt

@@ -243,6 +243,9 @@ Optimizations
 * LUCENE-9373: FunctionMatchQuery now accepts a "matchCost" optimization hint.
   (Maxim Glazkov, David Smiley)
 
+* LUCENE-9510: Indexing with an index sort is now faster by not compressing
+  temporary representations of the data. (Adrien Grand)
+
 Bug Fixes
 ---------------------
 
SortingStoredFieldsConsumer.java

@@ -19,21 +19,65 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Map;
 import java.util.Objects;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.StoredFieldsReader;
 import org.apache.lucene.codecs.StoredFieldsWriter;
+import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.compressing.Compressor;
+import org.apache.lucene.codecs.compressing.Decompressor;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 
 final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
+
+  static final CompressionMode NO_COMPRESSION = new CompressionMode() {
+    @Override
+    public Compressor newCompressor() {
+      return new Compressor() {
+        @Override
+        public void close() throws IOException {}
+
+        @Override
+        public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
+          out.writeBytes(bytes, off, len);
+        }
+      };
+    }
+
+    @Override
+    public Decompressor newDecompressor() {
+      return new Decompressor() {
+        @Override
+        public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes)
+            throws IOException {
+          bytes.bytes = ArrayUtil.grow(bytes.bytes, length);
+          in.skipBytes(offset);
+          in.readBytes(bytes.bytes, 0, length);
+          bytes.offset = 0;
+          bytes.length = length;
+        }
+
+        @Override
+        public Decompressor clone() {
+          return this;
+        }
+      };
+    }
+  };
+  private static final StoredFieldsFormat TEMP_STORED_FIELDS_FORMAT = new CompressingStoredFieldsFormat(
+      "TempStoredFields", NO_COMPRESSION, 128*1024, 1, 10);
   TrackingTmpOutputDirectoryWrapper tmpDirectory;
 
   SortingStoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) {
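As a side note, here is a tiny round-trip sketch (mine, not part of the commit) of what the pass-through mode above does: compress() is a plain byte copy and decompress() reads the bytes straight back. The helper class and method are hypothetical and sit in org.apache.lucene.index only so they can see the package-private NO_COMPRESSION constant.

    package org.apache.lucene.index;

    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteBuffersDataOutput;
    import org.apache.lucene.util.BytesRef;

    class NoCompressionRoundTrip {
      static BytesRef roundTrip(String text) throws IOException {
        byte[] data = text.getBytes(StandardCharsets.UTF_8);
        ByteBuffersDataOutput out = new ByteBuffersDataOutput();
        // "Compression" with NO_COMPRESSION just copies the bytes to the output.
        SortingStoredFieldsConsumer.NO_COMPRESSION.newCompressor().compress(data, 0, data.length, out);
        BytesRef restored = new BytesRef();
        // "Decompression" reads them straight back into the BytesRef.
        SortingStoredFieldsConsumer.NO_COMPRESSION.newDecompressor()
            .decompress(new ByteArrayDataInput(out.toArrayCopy()), data.length, 0, data.length, restored);
        return restored; // holds exactly the original UTF-8 bytes of `text`
      }
    }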
@@ -44,21 +88,14 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
   protected void initStoredFieldsWriter() throws IOException {
     if (writer == null) {
       this.tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory);
-      this.writer = codec.storedFieldsFormat().fieldsWriter(tmpDirectory, info, IOContext.DEFAULT);
+      this.writer = TEMP_STORED_FIELDS_FORMAT.fieldsWriter(tmpDirectory, info, IOContext.DEFAULT);
     }
   }
 
   @Override
   void flush(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
     super.flush(state, sortMap);
-    if (sortMap == null) {
-      // we're lucky the index is already sorted, just rename the temporary file and return
-      for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
-        tmpDirectory.rename(entry.getValue(), entry.getKey());
-      }
-      return;
-    }
-    StoredFieldsReader reader = codec.storedFieldsFormat()
+    StoredFieldsReader reader = TEMP_STORED_FIELDS_FORMAT
         .fieldsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
     // Don't pull a merge instance, since merge instances optimize for
     // sequential access while we consume stored fields in random order here.

@@ -69,7 +106,7 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
     CopyVisitor visitor = new CopyVisitor(sortWriter);
     for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
       sortWriter.startDocument();
-      reader.visitDocument(sortMap.newToOld(docID), visitor);
+      reader.visitDocument(sortMap == null ? docID : sortMap.newToOld(docID), visitor);
       sortWriter.finishDocument();
     }
     sortWriter.finish(state.fieldInfos, state.segmentInfo.maxDoc());
SortingTermVectorsConsumer.java

@@ -23,8 +23,10 @@ import java.util.Map;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FlushInfo;

@@ -35,6 +37,9 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntBlockPool;
 
 final class SortingTermVectorsConsumer extends TermVectorsConsumer {
+
+  private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT = new CompressingTermVectorsFormat(
+      "TempTermVectors", "", SortingStoredFieldsConsumer.NO_COMPRESSION, 8*1024, 10);
   TrackingTmpOutputDirectoryWrapper tmpDirectory;
 
   SortingTermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info, Codec codec) {
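For readability, the temporary term vectors format declared above is repeated below with each constructor argument annotated; the parameter meanings in the comments are my reading of CompressingTermVectorsFormat and should be treated as approximate, not as documentation from this commit.

    private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT = new CompressingTermVectorsFormat(
        "TempTermVectors",                          // format name recorded in the temporary files
        "",                                         // segment suffix (none)
        SortingStoredFieldsConsumer.NO_COMPRESSION, // the pass-through CompressionMode defined earlier
        8*1024,                                     // chunk size in bytes
        10);                                        // block size parameter of the chunk index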
@@ -45,22 +50,17 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
   void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap, NormsProducer norms) throws IOException {
     super.flush(fieldsToFlush, state, sortMap, norms);
     if (tmpDirectory != null) {
-      if (sortMap == null) {
-        // we're lucky the index is already sorted, just rename the temporary file and return
-        for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
-          tmpDirectory.rename(entry.getValue(), entry.getKey());
-        }
-        return;
-      }
-      TermVectorsReader reader = codec.termVectorsFormat()
+      TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT
           .vectorsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
-      TermVectorsReader mergeReader = reader.getMergeInstance();
+      // Don't pull a merge instance, since merge instances optimize for
+      // sequential access while term vectors will likely be accessed in random
+      // order here.
       TermVectorsWriter writer = codec.termVectorsFormat()
           .vectorsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT);
       try {
         reader.checkIntegrity();
         for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
-          Fields vectors = mergeReader.get(sortMap.newToOld(docID));
+          Fields vectors = reader.get(sortMap == null ? docID : sortMap.newToOld(docID));
           writeTermVectors(writer, vectors, state.fieldInfos);
         }
         writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
@@ -77,7 +77,7 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
     if (writer == null) {
       IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
       tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory);
-      writer = codec.termVectorsFormat().vectorsWriter(tmpDirectory, info, context);
+      writer = TEMP_TERM_VECTORS_FORMAT.vectorsWriter(tmpDirectory, info, context);
       lastDocID = 0;
     }
   }