LUCENE-9027: Try to get back some indexing speed.

2019-11-22 11:42:25 +01:00 · 2019-11-22 11:42:25 +01:00 · c51006c3c4
parent acd56b350d
commit c51006c3c4
3 changed files with 35 additions and 29 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java
@ -19,7 +19,6 @@
 package org.apache.lucene.codecs.lucene84;

 import java.io.IOException;
-import java.util.Arrays;

 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
@ -237,15 +236,18 @@ final class ForUtil {
    }

    final int numLongsPerShift = bitsPerValue * 2;
-    Arrays.fill(tmp, 0L);
    int idx = 0;
-    for (int shift = nextPrimitive - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
+    int shift = nextPrimitive - bitsPerValue;
+    for (int i = 0; i < numLongsPerShift; ++i) {
+      tmp[i] = longs[idx++] << shift;
+    }
+    for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
      for (int i = 0; i < numLongsPerShift; ++i) {
        tmp[i] |= longs[idx++] << shift;
      }
    }

-    final int remainingBitsPerLong = nextPrimitive % bitsPerValue;
+    final int remainingBitsPerLong = shift + bitsPerValue;
    final long maskRemainingBitsPerLong;
    if (nextPrimitive == 8) {
      maskRemainingBitsPerLong = mask8(remainingBitsPerLong);
@ -254,6 +256,7 @@ final class ForUtil {
    } else {
      maskRemainingBitsPerLong = mask32(remainingBitsPerLong);
    }
+
    int tmpIdx = 0;
    int remainingBitsPerValue = bitsPerValue;
    while (idx < numLongs) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java
@ -41,7 +41,6 @@ import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SlowImpactsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
@ -413,13 +412,13 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
      } else if (docFreq == 1) {
        docBuffer[0] = singletonDocID;
        freqBuffer[0] = totalTermFreq;
-        Arrays.fill(docBuffer, 1, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[1] = NO_MORE_DOCS;
        blockUpto++;
      } else {
        // Read vInts:
        readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
        prefixSum(docBuffer, left, accum);
-        Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[left] = NO_MORE_DOCS;
        blockUpto += left;
      }
      accum = docBuffer[BLOCK_SIZE - 1];
@ -516,8 +515,8 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
    final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil);
    final PForUtil pforUtil = new PForUtil(forUtil);

-    private final long[] docBuffer = new long[BLOCK_SIZE];
-    private final long[] freqBuffer = new long[BLOCK_SIZE];
+    private final long[] docBuffer = new long[BLOCK_SIZE+1];
+    private final long[] freqBuffer = new long[BLOCK_SIZE+1];
    private final long[] posDeltaBuffer = new long[BLOCK_SIZE];

    private final long[] payloadLengthBuffer;
@ -550,7 +549,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {

    private int docFreq;                              // number of docs in this posting list
    private long totalTermFreq;                       // number of positions in this posting list
-    private int docUpto;                              // how many docs we've read
+    private int blockUpto;                            // number of docs in or before the current block
    private int doc;                                  // doc we last read
    private long accum;                               // accumulator for doc deltas
    private int freq;                                 // freq we last read
@ -625,6 +624,9 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
        payloadBytes = null;
        payload = null;
      }
+
+      // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in advance()
+      docBuffer[BLOCK_SIZE] = NO_MORE_DOCS;
    }

    public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
@ -664,7 +666,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {

      doc = -1;
      accum = 0;
-      docUpto = 0;
+      blockUpto = 0;
      if (docFreq > BLOCK_SIZE) {
        nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
      } else {
@ -686,23 +688,27 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
    }

    private void refillDocs() throws IOException {
-      final int left = docFreq - docUpto;
+      final int left = docFreq - blockUpto;
      assert left >= 0;

      if (left >= BLOCK_SIZE) {
        forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
        pforUtil.decode(docIn, freqBuffer);
+        blockUpto += BLOCK_SIZE;
      } else if (docFreq == 1) {
        docBuffer[0] = singletonDocID;
        freqBuffer[0] = totalTermFreq;
-        Arrays.fill(docBuffer, 1, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[1] = NO_MORE_DOCS;
+        blockUpto++;
      } else {
        readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
        prefixSum(docBuffer, left, accum);
-        Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[left] = NO_MORE_DOCS;
+        blockUpto += left;
      }
      accum = docBuffer[BLOCK_SIZE - 1];
      docBufferUpto = 0;
+      assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
    }

    private void refillPositions() throws IOException {
@ -784,7 +790,6 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
      freq = (int) freqBuffer[docBufferUpto];
      posPendingCount += freq;
      docBufferUpto++;
-      docUpto++;

      position = 0;
      lastStartOffset = 0;
@ -813,10 +818,10 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {

        final int newDocUpto = skipper.skipTo(target) + 1;

-        if (newDocUpto > docUpto) {
+        if (newDocUpto > blockUpto - BLOCK_SIZE + docBufferUpto) {
          // Skipper moved
          assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
-          docUpto = newDocUpto;
+          blockUpto = newDocUpto;

          // Force to read next block
          docBufferUpto = BLOCK_SIZE;
@ -841,15 +846,10 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
        freq = (int) freqBuffer[docBufferUpto];
        posPendingCount += freq;
        docBufferUpto++;
-        docUpto++;

        if (doc >= target) {
          break;
        }
-
-        if (docBufferUpto == BLOCK_SIZE) {
-          return this.doc = NO_MORE_DOCS;
-        }
      }

      position = 0;
@ -1079,7 +1079,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
      } else {
        readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs);
        prefixSum(docBuffer, left, accum);
-        Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[left] = NO_MORE_DOCS;
        blockUpto += left;
      }
      accum = docBuffer[BLOCK_SIZE - 1];
@ -1282,7 +1282,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
      } else {
        readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
        prefixSum(docBuffer, left, accum);
-        Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[left] = NO_MORE_DOCS;
      }
      accum = docBuffer[BLOCK_SIZE - 1];
      docBufferUpto = 0;
@ -1664,7 +1664,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase {
      } else {
        readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
        prefixSum(docBuffer, left, accum);
-        Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS);
+        docBuffer[left] = NO_MORE_DOCS;
      }
      accum = docBuffer[BLOCK_SIZE - 1];
      docBufferUpto = 0;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py
@ -42,7 +42,6 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
 package org.apache.lucene.codecs.lucene84;

 import java.io.IOException;
-import java.util.Arrays;

 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
@ -260,15 +259,18 @@ final class ForUtil {
    }

    final int numLongsPerShift = bitsPerValue * 2;
-    Arrays.fill(tmp, 0L);
    int idx = 0;
-    for (int shift = nextPrimitive - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
+    int shift = nextPrimitive - bitsPerValue;
+    for (int i = 0; i < numLongsPerShift; ++i) {
+      tmp[i] = longs[idx++] << shift;
+    }
+    for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
      for (int i = 0; i < numLongsPerShift; ++i) {
        tmp[i] |= longs[idx++] << shift;
      }
    }

-    final int remainingBitsPerLong = nextPrimitive % bitsPerValue;
+    final int remainingBitsPerLong = shift + bitsPerValue;
    final long maskRemainingBitsPerLong;
    if (nextPrimitive == 8) {
      maskRemainingBitsPerLong = mask8(remainingBitsPerLong);
@ -277,6 +279,7 @@ final class ForUtil {
    } else {
      maskRemainingBitsPerLong = mask32(remainingBitsPerLong);
    }
+
    int tmpIdx = 0;
    int remainingBitsPerValue = bitsPerValue;
    while (idx < numLongs) {