mirror of https://github.com/apache/lucene.git
Fully reuse postings enums when flushing sorted indexes. (#12206)
Currently we are only half-reusing postings enums when flushing sorted indexes: the wrapped enum is reused, but we still create new wrapper instances every time, which can be costly for fields that have many terms.
This commit is contained in:
parent d3b6ef3c86
commit 0782535017
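
Note: the fix follows the standard enum-reuse pattern. Instead of threading all state through a constructor and allocating a fresh wrapper per term, each wrapper gets a cheap no-arg constructor plus a reset(...) method that re-initializes it for the next term. A minimal sketch of the pattern, with hypothetical names rather than the actual Lucene classes:

interface PostingsLike {
  int nextDoc() throws java.io.IOException;
}

final class ReusableSortingWrapper implements PostingsLike {
  private PostingsLike in; // re-targeted on every reset instead of per-instance

  ReusableSortingWrapper() {} // allocated once per flush, not once per term

  // Re-initializes this wrapper to serve the next term's postings.
  void reset(PostingsLike in) {
    this.in = in;
  }

  @Override
  public int nextDoc() throws java.io.IOException {
    return in.nextDoc();
  }
}
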
FreqProxTermsWriter.java

@@ -188,7 +188,7 @@ final class FreqProxTermsWriter extends TermsHash {
        wrapReuse = (SortingPostingsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
-        wrapReuse = null;
+        wrapReuse = new SortingPostingsEnum();
        inReuse = reuse;
      }
 
@@ -201,8 +201,8 @@ final class FreqProxTermsWriter extends TermsHash {
          indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      final boolean storeOffsets =
          indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-      return new SortingPostingsEnum(
-          docMap.size(), wrapReuse, inDocsAndPositions, docMap, storePositions, storeOffsets);
+      wrapReuse.reset(docMap, inDocsAndPositions, storePositions, storeOffsets);
+      return wrapReuse;
    }
 
      final PostingsEnum inReuse;
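
Note: this reuse only pays off if callers hand the returned enum back in, which is Lucene's postings(reuse, flags) contract; that is why the code above unwraps a previously returned SortingPostingsEnum and passes the inner enum down as inReuse. A hedged sketch of the calling convention (assumes org.apache.lucene.index.TermsEnum/PostingsEnum and org.apache.lucene.search.DocIdSetIterator):

// The enum returned by postings(...) is passed back in on the next call,
// which is what lets the instanceof/unwrap logic above take effect.
static void consumeAllTerms(TermsEnum terms) throws IOException {
  PostingsEnum reuse = null;
  while (terms.next() != null) {
    reuse = terms.postings(reuse, PostingsEnum.POSITIONS);
    for (int doc = reuse.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = reuse.nextDoc()) {
      // consume postings for this term
    }
  }
}
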
@@ -213,33 +213,29 @@ final class FreqProxTermsWriter extends TermsHash {
        wrapReuse = (SortingDocsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
-        wrapReuse = null;
-        inReuse = null;
+        wrapReuse = new SortingDocsEnum();
+        inReuse = reuse;
      }
 
      final PostingsEnum inDocs = in.postings(inReuse, flags);
-      return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, docMap);
+      wrapReuse.reset(docMap, inDocs);
+      return wrapReuse;
    }
  }
 
  static class SortingDocsEnum extends PostingsEnum {
 
-    private final PostingsEnum in;
    private final LSBRadixSorter sorter;
-    private int[] docs;
-    private int docIt = -1;
-    private final int upTo;
+    private PostingsEnum in;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private int docIt;
+    private int upTo;
 
-    SortingDocsEnum(
-        int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, final Sorter.DocMap docMap)
-        throws IOException {
-      if (reuse != null) {
-        sorter = reuse.sorter;
-        docs = reuse.docs;
-      } else {
-        sorter = new LSBRadixSorter();
-        docs = IntsRef.EMPTY_INTS;
-      }
+    SortingDocsEnum() {
+      sorter = new LSBRadixSorter();
+    }
+
+    void reset(Sorter.DocMap docMap, PostingsEnum in) throws IOException {
      this.in = in;
      int i = 0;
      for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {
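
Note: reset(docMap, in) replaces the old constructor body: it drains the wrapped enum into a growable int[] of remapped doc IDs, sorts them, and rewinds the iterator, so one SortingDocsEnum can serve every term. A simplified standalone sketch of that buffer-then-sort approach (plain JDK; Arrays.sort stands in for Lucene's LSBRadixSorter):

import java.util.Arrays;

final class BufferedSortedDocs {
  interface DocSupplier { int nextDoc(); }     // returns -1 when exhausted
  interface DocMap { int oldToNew(int oldDoc); }

  private int[] docs = new int[0];
  private int upTo;
  private int docIt = -1;

  // Buffer remapped doc IDs, sort them, and rewind the iterator.
  void reset(DocMap docMap, DocSupplier in) {
    upTo = 0;
    for (int doc = in.nextDoc(); doc != -1; doc = in.nextDoc()) {
      if (upTo == docs.length) {
        docs = Arrays.copyOf(docs, Math.max(8, docs.length * 2));
      }
      docs[upTo++] = docMap.oldToNew(doc); // remap into the sorted index's doc space
    }
    Arrays.sort(docs, 0, upTo);
    docIt = -1; // rewind so the enum can be iterated from the start
  }

  int nextDoc() {
    return ++docIt < upTo ? docs[docIt] : -1;
  }
}
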
@@ -253,10 +249,12 @@ final class FreqProxTermsWriter extends TermsHash {
        docs = ArrayUtil.grow(docs);
      }
      docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
+      final int maxDoc = docMap.size();
      final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
      // Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
      // often still faster for nearly-sorted inputs.
      sorter.sort(numBits, docs, upTo);
+      docIt = -1;
    }
 
    PostingsEnum getWrapped() {
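
Note: LSBRadixSorter sorts non-negative ints with stable counting-sort passes over 8-bit digits, least significant first, and numBits bounds the number of passes; that is why the caller computes PackedInts.bitsRequired(maxDoc - 1) and sorts only the first upTo slots (the NO_MORE_DOCS sentinel would need 31 bits). A hedged minimal version of the idea, not Lucene's implementation (which also reuses its scratch buffer):

// Stable LSB radix sort of the non-negative ints in docs[0..len).
static void lsbRadixSort(int numBits, int[] docs, int len) {
  int[] buffer = new int[len];
  for (int shift = 0; shift < numBits; shift += 8) {
    int[] counts = new int[257];
    for (int i = 0; i < len; i++) {
      counts[((docs[i] >>> shift) & 0xFF) + 1]++; // histogram of this digit
    }
    for (int i = 1; i < counts.length; i++) {
      counts[i] += counts[i - 1]; // prefix sums -> bucket start offsets
    }
    for (int i = 0; i < len; i++) {
      buffer[counts[(docs[i] >>> shift) & 0xFF]++] = docs[i]; // stable scatter
    }
    System.arraycopy(buffer, 0, docs, 0, len);
  }
}
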
@@ -311,7 +309,7 @@ final class FreqProxTermsWriter extends TermsHash {
      }
    }
 
-  static class SortingPostingsEnum extends FilterLeafReader.FilterPostingsEnum {
+  static class SortingPostingsEnum extends PostingsEnum {
 
    /**
     * A {@link TimSorter} which sorts two parallel arrays of doc IDs and offsets in one go. Everyti
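
Note: the superclass change is what makes the reset() style possible here. A filter-style wrapper such as FilterLeafReader.FilterPostingsEnum fixes its delegate at construction time, while a resettable wrapper needs a mutable field it can re-target; hence the explicit in field and the cost() delegation added at the end of this class. A hypothetical sketch of the difference:

abstract class FixedDelegate {
  protected final Runnable in; // pinned for the wrapper's lifetime
  FixedDelegate(Runnable in) { this.in = in; }
}

final class ResettableDelegate {
  private Runnable in; // mutable: reset() can point at a new delegate
  void reset(Runnable in) { this.in = in; }
  void run() { in.run(); }
}
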
@@ -324,8 +322,8 @@ final class FreqProxTermsWriter extends TermsHash {
      private int[] tmpDocs;
      private long[] tmpOffsets;
 
-      public DocOffsetSorter(int maxDoc) {
-        super(maxDoc / 8);
+      public DocOffsetSorter(int numTempSlots) {
+        super(numTempSlots);
        this.tmpDocs = IntsRef.EMPTY_INTS;
        this.tmpOffsets = LongsRef.EMPTY_LONGS;
      }
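
Note: TimSorter's constructor takes the maximum number of temporary slots it may allocate for merging runs; passing numTempSlots in moves the maxDoc / 8 sizing decision to the caller, which is the only place that still knows maxDoc after this change. The sorter itself keeps two parallel arrays aligned, swapping an offset whenever it swaps a doc ID. A hedged sketch of that parallel-array idea, with insertion sort standing in for TimSorter:

// Sort docs[0..len) ascending while keeping offsets[i] paired with docs[i].
static void sortParallel(int[] docs, long[] offsets, int len) {
  for (int i = 1; i < len; i++) {
    for (int j = i; j > 0 && docs[j - 1] > docs[j]; j--) {
      int d = docs[j]; docs[j] = docs[j - 1]; docs[j - 1] = d;
      long o = offsets[j]; offsets[j] = offsets[j - 1]; offsets[j - 1] = o;
    }
  }
}
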
@@ -379,55 +377,38 @@ final class FreqProxTermsWriter extends TermsHash {
      }
    }
 
-    private final int maxDoc;
-    private final DocOffsetSorter sorter;
-    private int[] docs;
-    private long[] offsets;
-    private final int upto;
+    private DocOffsetSorter sorter;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private long[] offsets = LongsRef.EMPTY_LONGS;
+    private int upto;
 
-    private final ByteBuffersDataInput postingInput;
-    private final boolean storePositions, storeOffsets;
+    private ByteBuffersDataInput postingInput;
+    private PostingsEnum in;
+    private boolean storePositions, storeOffsets;
 
-    private int docIt = -1;
+    private int docIt;
    private int pos;
-    private int startOffset = -1;
-    private int endOffset = -1;
-    private final BytesRef payload;
+    private int startOffset;
+    private int endOffset;
+    private final BytesRef payload = new BytesRef();
    private int currFreq;
 
-    private final ByteBuffersDataOutput buffer;
+    private final ByteBuffersDataOutput buffer = ByteBuffersDataOutput.newResettableInstance();
 
-    SortingPostingsEnum(
-        int maxDoc,
-        SortingPostingsEnum reuse,
-        final PostingsEnum in,
-        Sorter.DocMap docMap,
-        boolean storePositions,
-        boolean storeOffsets)
+    void reset(Sorter.DocMap docMap, PostingsEnum in, boolean storePositions, boolean storeOffsets)
        throws IOException {
-      super(in);
-      this.maxDoc = maxDoc;
+      this.in = in;
      this.storePositions = storePositions;
      this.storeOffsets = storeOffsets;
-      if (reuse != null) {
-        docs = reuse.docs;
-        offsets = reuse.offsets;
-        payload = reuse.payload;
-        buffer = reuse.buffer;
-        buffer.reset();
-        if (reuse.maxDoc == maxDoc) {
-          sorter = reuse.sorter;
-        } else {
-          sorter = new DocOffsetSorter(maxDoc);
-        }
-      } else {
-        docs = new int[32];
-        offsets = new long[32];
-        payload = new BytesRef(32);
-        buffer = ByteBuffersDataOutput.newResettableInstance();
-        sorter = new DocOffsetSorter(maxDoc);
+      if (sorter == null) {
+        final int numTempSlots = docMap.size() / 8;
+        sorter = new DocOffsetSorter(numTempSlots);
      }
+      docIt = -1;
+      startOffset = -1;
+      endOffset = -1;
 
+      buffer.reset();
      int doc;
      int i = 0;
      while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
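
Note: SortingPostingsEnum's reset(...) is a buffer-and-replay pass: it drains the wrapped enum once, serializing each document's freq/positions/offsets/payloads into the reusable ByteBuffersDataOutput while collecting (newDocId, bufferOffset) pairs, sorts the pairs with DocOffsetSorter, and then replays each document's data from its recorded offset during iteration. A condensed, self-contained sketch of the idea using a plain ByteBuffer in place of Lucene's buffer classes:

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Comparator;

final class SortedReplaySketch {
  public static void main(String[] args) {
    int[][] positions = { {1, 4}, {2}, {0, 3, 9} }; // toy postings: old doc -> positions
    int[] oldToNew = {2, 0, 1};                     // the doc map (index sort permutation)

    ByteBuffer buffer = ByteBuffer.allocate(1024);
    int numDocs = positions.length;
    int[] docs = new int[numDocs];
    long[] offsets = new long[numDocs];
    for (int oldDoc = 0; oldDoc < numDocs; oldDoc++) {
      docs[oldDoc] = oldToNew[oldDoc];
      offsets[oldDoc] = buffer.position(); // remember where this doc's data starts
      buffer.putInt(positions[oldDoc].length);
      for (int p : positions[oldDoc]) buffer.putInt(p);
    }

    // Sort the parallel arrays by new doc ID (cf. DocOffsetSorter).
    Integer[] order = new Integer[numDocs];
    for (int i = 0; i < numDocs; i++) order[i] = i;
    Arrays.sort(order, Comparator.comparingInt(i -> docs[i]));

    for (int i : order) { // replay each doc's postings from its recorded offset
      buffer.position((int) offsets[i]);
      int freq = buffer.getInt();
      System.out.print("doc " + docs[i] + ":");
      for (int j = 0; j < freq; j++) System.out.print(" " + buffer.getInt());
      System.out.println();
    }
  }
}
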
@@ -548,5 +529,10 @@ final class FreqProxTermsWriter extends TermsHash {
    PostingsEnum getWrapped() {
      return in;
    }
+
+    @Override
+    public long cost() {
+      return in.cost();
+    }
  }
}

ByteBuffersDataInput.java

@@ -55,10 +55,10 @@ public final class ByteBuffersDataInput extends DataInput
  public ByteBuffersDataInput(List<ByteBuffer> buffers) {
    ensureAssumptions(buffers);
 
-    this.blocks =
-        buffers.stream()
-            .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
-            .toArray(ByteBuffer[]::new);
+    this.blocks = buffers.toArray(ByteBuffer[]::new);
+    for (int i = 0; i < blocks.length; ++i) {
+      blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
+    }
    // pre-allocate these arrays and create the view buffers lazily
    this.floatBuffers = new FloatBuffer[blocks.length * Float.BYTES];
    this.longBuffers = new LongBuffer[blocks.length * Long.BYTES];
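
Note: both changes to this constructor swap a small Stream pipeline for a plain loop; ByteBuffersDataInput instances are created on hot paths such as the sorting flush above, and the loop avoids the Stream plumbing and lambda allocation. The two forms are behaviorally equivalent, shown side by side as a hedged micro-example:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;

final class StreamVsLoop {
  static ByteBuffer[] viaStream(List<ByteBuffer> buffers) {
    return buffers.stream()
        .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
        .toArray(ByteBuffer[]::new);
  }

  static ByteBuffer[] viaLoop(List<ByteBuffer> buffers) {
    ByteBuffer[] blocks = buffers.toArray(ByteBuffer[]::new);
    for (int i = 0; i < blocks.length; ++i) {
      blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
    }
    return blocks;
  }
}
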
@@ -71,7 +71,11 @@ public final class ByteBuffersDataInput extends DataInput
      this.blockMask = (1 << blockBits) - 1;
    }
 
-    this.size = Arrays.stream(blocks).mapToLong(block -> block.remaining()).sum();
+    long size = 0;
+    for (ByteBuffer block : blocks) {
+      size += block.remaining();
+    }
+    this.size = size;
 
    // The initial "position" of this stream is shifted by the position of the first block.
    this.offset = blocks[0].position();
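
Note: same motivation for the size computation: the Arrays.stream(blocks).mapToLong(...).sum() pipeline becomes a plain accumulation loop. An equivalent pair, for comparison:

import java.nio.ByteBuffer;
import java.util.Arrays;

final class SumRemaining {
  static long viaStream(ByteBuffer[] blocks) {
    return Arrays.stream(blocks).mapToLong(ByteBuffer::remaining).sum();
  }

  static long viaLoop(ByteBuffer[] blocks) {
    long size = 0;
    for (ByteBuffer block : blocks) {
      size += block.remaining();
    }
    return size;
  }
}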