From bc758601cd8f77136e0f8bb8467927c3e37c7ddf Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 22 Nov 2019 11:42:25 +0100 Subject: [PATCH] LUCENE-9027: Try to get back some indexing speed. --- .../lucene/codecs/lucene84/ForUtil.java | 11 +++-- .../lucene84/Lucene84PostingsReader.java | 42 +++++++++---------- .../lucene/codecs/lucene84/gen_ForUtil.py | 11 +++-- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java index f16aa8377fe..74b72abb273 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/ForUtil.java @@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene84; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -237,15 +236,18 @@ final class ForUtil { } final int numLongsPerShift = bitsPerValue * 2; - Arrays.fill(tmp, 0L); int idx = 0; - for (int shift = nextPrimitive - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + int shift = nextPrimitive - bitsPerValue; + for (int i = 0; i < numLongsPerShift; ++i) { + tmp[i] = longs[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { for (int i = 0; i < numLongsPerShift; ++i) { tmp[i] |= longs[idx++] << shift; } } - final int remainingBitsPerLong = nextPrimitive % bitsPerValue; + final int remainingBitsPerLong = shift + bitsPerValue; final long maskRemainingBitsPerLong; if (nextPrimitive == 8) { maskRemainingBitsPerLong = mask8(remainingBitsPerLong); @@ -254,6 +256,7 @@ final class ForUtil { } else { maskRemainingBitsPerLong = mask32(remainingBitsPerLong); } + int tmpIdx = 0; int remainingBitsPerValue = bitsPerValue; while (idx < numLongs) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java index fe4514d3ad4..b0620997726 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java @@ -41,7 +41,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; @@ -413,13 +412,13 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } else if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = totalTermFreq; - Arrays.fill(docBuffer, 1, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[1] = NO_MORE_DOCS; blockUpto++; } else { // Read vInts: readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); prefixSum(docBuffer, left, accum); - Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[left] = NO_MORE_DOCS; blockUpto += left; } accum = docBuffer[BLOCK_SIZE - 1]; @@ -516,8 +515,8 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(forUtil); final PForUtil pforUtil = new PForUtil(forUtil); - private final long[] docBuffer = new long[BLOCK_SIZE]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + private final long[] docBuffer = new long[BLOCK_SIZE+1]; + private final long[] freqBuffer = new long[BLOCK_SIZE+1]; private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; private final long[] payloadLengthBuffer; @@ -550,7 +549,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { private int docFreq; // number of docs in this posting list private long totalTermFreq; // number of positions in this posting list - private int docUpto; // how many docs we've read + private int blockUpto; // number of docs in or before the current block private int doc; // doc we last read private long accum; // accumulator for doc deltas private int freq; // freq we last read @@ -625,6 +624,9 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { payloadBytes = null; payload = null; } + + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { @@ -664,7 +666,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { doc = -1; accum = 0; - docUpto = 0; + blockUpto = 0; if (docFreq > BLOCK_SIZE) { nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block } else { @@ -686,23 +688,27 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } private void refillDocs() throws IOException { - final int left = docFreq - docUpto; + final int left = docFreq - blockUpto; assert left >= 0; if (left >= BLOCK_SIZE) { forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer); pforUtil.decode(docIn, freqBuffer); + blockUpto += BLOCK_SIZE; } else if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = totalTermFreq; - Arrays.fill(docBuffer, 1, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[1] = NO_MORE_DOCS; + blockUpto++; } else { readVIntBlock(docIn, docBuffer, freqBuffer, left, true); prefixSum(docBuffer, left, accum); - Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[left] = NO_MORE_DOCS; + blockUpto += left; } accum = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; + assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; } private void refillPositions() throws IOException { @@ -784,7 +790,6 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { freq = (int) freqBuffer[docBufferUpto]; posPendingCount += freq; docBufferUpto++; - docUpto++; position = 0; lastStartOffset = 0; @@ -813,10 +818,10 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { final int newDocUpto = skipper.skipTo(target) + 1; - if (newDocUpto > docUpto) { + if (newDocUpto > blockUpto - BLOCK_SIZE + docBufferUpto) { // Skipper moved assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; - docUpto = newDocUpto; + blockUpto = newDocUpto; // Force to read next block docBufferUpto = BLOCK_SIZE; @@ -841,15 +846,10 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { freq = (int) freqBuffer[docBufferUpto]; posPendingCount += freq; docBufferUpto++; - docUpto++; if (doc >= target) { break; } - - if (docBufferUpto == BLOCK_SIZE) { - return this.doc = NO_MORE_DOCS; - } } position = 0; @@ -1079,7 +1079,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } else { readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs); prefixSum(docBuffer, left, accum); - Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[left] = NO_MORE_DOCS; blockUpto += left; } accum = docBuffer[BLOCK_SIZE - 1]; @@ -1282,7 +1282,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } else { readVIntBlock(docIn, docBuffer, freqBuffer, left, true); prefixSum(docBuffer, left, accum); - Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[left] = NO_MORE_DOCS; } accum = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; @@ -1664,7 +1664,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } else { readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); prefixSum(docBuffer, left, accum); - Arrays.fill(docBuffer, left, BLOCK_SIZE, DocIdSetIterator.NO_MORE_DOCS); + docBuffer[left] = NO_MORE_DOCS; } accum = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py index dfcfa01fedb..260a6834ab3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/gen_ForUtil.py @@ -42,7 +42,6 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT package org.apache.lucene.codecs.lucene84; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -260,15 +259,18 @@ final class ForUtil { } final int numLongsPerShift = bitsPerValue * 2; - Arrays.fill(tmp, 0L); int idx = 0; - for (int shift = nextPrimitive - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + int shift = nextPrimitive - bitsPerValue; + for (int i = 0; i < numLongsPerShift; ++i) { + tmp[i] = longs[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { for (int i = 0; i < numLongsPerShift; ++i) { tmp[i] |= longs[idx++] << shift; } } - final int remainingBitsPerLong = nextPrimitive % bitsPerValue; + final int remainingBitsPerLong = shift + bitsPerValue; final long maskRemainingBitsPerLong; if (nextPrimitive == 8) { maskRemainingBitsPerLong = mask8(remainingBitsPerLong); @@ -277,6 +279,7 @@ final class ForUtil { } else { maskRemainingBitsPerLong = mask32(remainingBitsPerLong); } + int tmpIdx = 0; int remainingBitsPerValue = bitsPerValue; while (idx < numLongs) {