mirror of https://github.com/apache/lucene.git
Fully reuse postings enums when flushing sorted indexes. (#12206)
Currently we're only half reusing postings enums when flushing sorted indexes, as we still create new wrapper instances every time, which can be costly for fields that have many terms.
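The shape of the change, as a minimal sketch with hypothetical names (ReusableWrapper, wrap) rather than the actual Lucene classes: the caller hands the previously returned wrapper back as `reuse`, and the wrapper is re-initialized in place instead of being reallocated for every term.

    // Hypothetical illustration of the reuse pattern; not the Lucene code itself.
    class ReusePattern {
      static final class ReusableWrapper {
        private int[] docs = new int[0];

        // Re-initialize state in place; no allocation on the hot path.
        void reset(int[] newDocs) {
          this.docs = newDocs;
        }
      }

      static ReusableWrapper wrap(ReusableWrapper reuse, int[] docs) {
        // Allocate a wrapper only when the caller has none to recycle.
        ReusableWrapper wrapper = (reuse != null) ? reuse : new ReusableWrapper();
        wrapper.reset(docs);
        return wrapper;
      }
    }

This is what the diff below does for SortingPostingsEnum and SortingDocsEnum: the constructors shrink to field initialization, and the per-term state moves into reset() methods.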
This commit is contained in:
parent d3b6ef3c86
commit 0782535017
lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java

@@ -188,7 +188,7 @@ final class FreqProxTermsWriter extends TermsHash {
         wrapReuse = (SortingPostingsEnum) reuse;
         inReuse = wrapReuse.getWrapped();
       } else {
-        wrapReuse = null;
+        wrapReuse = new SortingPostingsEnum();
         inReuse = reuse;
       }

@@ -201,8 +201,8 @@ final class FreqProxTermsWriter extends TermsHash {
           indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       final boolean storeOffsets =
           indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-      return new SortingPostingsEnum(
-          docMap.size(), wrapReuse, inDocsAndPositions, docMap, storePositions, storeOffsets);
+      wrapReuse.reset(docMap, inDocsAndPositions, storePositions, storeOffsets);
+      return wrapReuse;
     }

     final PostingsEnum inReuse;

@@ -213,33 +213,29 @@ final class FreqProxTermsWriter extends TermsHash {
         wrapReuse = (SortingDocsEnum) reuse;
         inReuse = wrapReuse.getWrapped();
       } else {
-        wrapReuse = null;
-        inReuse = null;
+        wrapReuse = new SortingDocsEnum();
+        inReuse = reuse;
       }

       final PostingsEnum inDocs = in.postings(inReuse, flags);
-      return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, docMap);
+      wrapReuse.reset(docMap, inDocs);
+      return wrapReuse;
     }
   }

   static class SortingDocsEnum extends PostingsEnum {

-    private final PostingsEnum in;
     private final LSBRadixSorter sorter;
-    private int[] docs;
-    private int docIt = -1;
-    private final int upTo;
+    private PostingsEnum in;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private int docIt;
+    private int upTo;

-    SortingDocsEnum(
-        int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, final Sorter.DocMap docMap)
-        throws IOException {
-      if (reuse != null) {
-        sorter = reuse.sorter;
-        docs = reuse.docs;
-      } else {
-        sorter = new LSBRadixSorter();
-        docs = IntsRef.EMPTY_INTS;
-      }
+    SortingDocsEnum() {
+      sorter = new LSBRadixSorter();
+    }

+    void reset(Sorter.DocMap docMap, PostingsEnum in) throws IOException {
+      this.in = in;
       int i = 0;
       for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {

@@ -253,10 +249,12 @@ final class FreqProxTermsWriter extends TermsHash {
         docs = ArrayUtil.grow(docs);
       }
       docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
+      final int maxDoc = docMap.size();
       final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
       // Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
       // often still faster for nearly-sorted inputs.
       sorter.sort(numBits, docs, upTo);
+      docIt = -1;
     }

     PostingsEnum getWrapped() {
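The comment in the hunk above motivates choosing LSBRadixSorter over TimSorter for the collected doc IDs. For readers unfamiliar with that utility, a standalone usage sketch of the same sort(numBits, array, len) call follows; the array contents and maxDoc are made-up example values.

    import org.apache.lucene.util.LSBRadixSorter;
    import org.apache.lucene.util.packed.PackedInts;

    class RadixSortExample {
      public static void main(String[] args) {
        // Made-up doc IDs in arbitrary order, as if remapped by a Sorter.DocMap.
        int[] docs = {42, 7, 19, 3, 25};
        final int maxDoc = 64;
        // Bound the number of radix passes by the bits actually needed,
        // mirroring the PackedInts.bitsRequired call in the diff.
        final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
        new LSBRadixSorter().sort(numBits, docs, docs.length);
        // docs is now {3, 7, 19, 25, 42}.
      }
    }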
@@ -311,7 +309,7 @@ final class FreqProxTermsWriter extends TermsHash {
     }
   }

-  static class SortingPostingsEnum extends FilterLeafReader.FilterPostingsEnum {
+  static class SortingPostingsEnum extends PostingsEnum {

     /**
      * A {@link TimSorter} which sorts two parallel arrays of doc IDs and offsets in one go. Everytime …

@@ -324,8 +322,8 @@ final class FreqProxTermsWriter extends TermsHash {
       private int[] tmpDocs;
       private long[] tmpOffsets;

-      public DocOffsetSorter(int maxDoc) {
-        super(maxDoc / 8);
+      public DocOffsetSorter(int numTempSlots) {
+        super(numTempSlots);
         this.tmpDocs = IntsRef.EMPTY_INTS;
         this.tmpOffsets = LongsRef.EMPTY_LONGS;
       }

@@ -379,55 +377,38 @@ final class FreqProxTermsWriter extends TermsHash {
       }
     }

-    private final int maxDoc;
-    private final DocOffsetSorter sorter;
-    private int[] docs;
-    private long[] offsets;
-    private final int upto;
+    private DocOffsetSorter sorter;
+    private int[] docs = IntsRef.EMPTY_INTS;
+    private long[] offsets = LongsRef.EMPTY_LONGS;
+    private int upto;

-    private final ByteBuffersDataInput postingInput;
-    private final boolean storePositions, storeOffsets;
+    private ByteBuffersDataInput postingInput;
+    private PostingsEnum in;
+    private boolean storePositions, storeOffsets;

-    private int docIt = -1;
+    private int docIt;
     private int pos;
-    private int startOffset = -1;
-    private int endOffset = -1;
-    private final BytesRef payload;
+    private int startOffset;
+    private int endOffset;
+    private final BytesRef payload = new BytesRef();
     private int currFreq;

-    private final ByteBuffersDataOutput buffer;
+    private final ByteBuffersDataOutput buffer = ByteBuffersDataOutput.newResettableInstance();

-    SortingPostingsEnum(
-        int maxDoc,
-        SortingPostingsEnum reuse,
-        final PostingsEnum in,
-        Sorter.DocMap docMap,
-        boolean storePositions,
-        boolean storeOffsets)
+    void reset(Sorter.DocMap docMap, PostingsEnum in, boolean storePositions, boolean storeOffsets)
         throws IOException {
-      super(in);
-      this.maxDoc = maxDoc;
+      this.in = in;
       this.storePositions = storePositions;
       this.storeOffsets = storeOffsets;
-      if (reuse != null) {
-        docs = reuse.docs;
-        offsets = reuse.offsets;
-        payload = reuse.payload;
-        buffer = reuse.buffer;
-        buffer.reset();
-        if (reuse.maxDoc == maxDoc) {
-          sorter = reuse.sorter;
-        } else {
-          sorter = new DocOffsetSorter(maxDoc);
-        }
-      } else {
-        docs = new int[32];
-        offsets = new long[32];
-        payload = new BytesRef(32);
-        buffer = ByteBuffersDataOutput.newResettableInstance();
-        sorter = new DocOffsetSorter(maxDoc);
+      if (sorter == null) {
+        final int numTempSlots = docMap.size() / 8;
+        sorter = new DocOffsetSorter(numTempSlots);
       }
+      docIt = -1;
+      startOffset = -1;
+      endOffset = -1;

+      buffer.reset();
       int doc;
       int i = 0;
       while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {

@@ -548,5 +529,10 @@ final class FreqProxTermsWriter extends TermsHash {
     PostingsEnum getWrapped() {
       return in;
     }
+
+    @Override
+    public long cost() {
+      return in.cost();
+    }
   }
 }
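For context on why full reuse pays off, this is the standard consumer-side pattern in Lucene: the enum returned by one postings() call is passed back as `reuse` for the next term. A sketch under that assumption (index and terms setup omitted):

    import java.io.IOException;
    import org.apache.lucene.index.PostingsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;

    class ReuseLoop {
      // Count postings across all terms, recycling a single enum instance.
      // After this commit, the sorting wrappers above participate in this
      // recycling instead of being wrapped anew for every term.
      static long countPostings(TermsEnum termsEnum) throws IOException {
        long count = 0;
        PostingsEnum postings = null; // handed back as `reuse` each iteration
        while (termsEnum.next() != null) {
          postings = termsEnum.postings(postings, PostingsEnum.FREQS);
          while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            count++;
          }
        }
        return count;
      }
    }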
lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java

@@ -55,10 +55,10 @@ public final class ByteBuffersDataInput extends DataInput
   public ByteBuffersDataInput(List<ByteBuffer> buffers) {
     ensureAssumptions(buffers);

-    this.blocks =
-        buffers.stream()
-            .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
-            .toArray(ByteBuffer[]::new);
+    this.blocks = buffers.toArray(ByteBuffer[]::new);
+    for (int i = 0; i < blocks.length; ++i) {
+      blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
+    }
     // pre-allocate these arrays and create the view buffers lazily
     this.floatBuffers = new FloatBuffer[blocks.length * Float.BYTES];
     this.longBuffers = new LongBuffer[blocks.length * Long.BYTES];

|
@ -71,7 +71,11 @@ public final class ByteBuffersDataInput extends DataInput
|
|||
this.blockMask = (1 << blockBits) - 1;
|
||||
}
|
||||
|
||||
this.size = Arrays.stream(blocks).mapToLong(block -> block.remaining()).sum();
|
||||
long size = 0;
|
||||
for (ByteBuffer block : blocks) {
|
||||
size += block.remaining();
|
||||
}
|
||||
this.size = size;
|
||||
|
||||
// The initial "position" of this stream is shifted by the position of the first block.
|
||||
this.offset = blocks[0].position();
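For readers unfamiliar with these classes, a small round trip through the resettable output buffer that SortingPostingsEnum uses above (a usage sketch, not part of this commit):

    import java.io.IOException;
    import org.apache.lucene.store.ByteBuffersDataInput;
    import org.apache.lucene.store.ByteBuffersDataOutput;

    class BuffersRoundTrip {
      static int roundTrip() throws IOException {
        ByteBuffersDataOutput out = ByteBuffersDataOutput.newResettableInstance();
        out.writeVInt(12206);
        // Wrap the written blocks in a ByteBuffersDataInput view and read back.
        ByteBuffersDataInput in = out.toDataInput();
        int value = in.readVInt(); // 12206
        out.reset(); // clears for reuse without discarding the backing blocks
        return value;
      }
    }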