mirror of https://github.com/apache/lucene.git
Speed up PostingsEnum when reading positions. (#14032)
This PR changes the following: - As much work as possible is moved from `nextDoc()`/`advance()` to `nextPosition()`. This ensures that the overhead of reading positions is only paid when all query terms agree on a candidate. - Frequencies are read lazily. Again, this helps in the case where a document is present in a block but the clauses never agree on a common candidate match, so its frequencies are never decoded. - A few other minor optimizations.
This commit is contained in:
parent
067b472a32
commit
b2a10e3643
|
@ -119,6 +119,9 @@ Optimizations
|
|||
* GITHUB#14023: Make JVM inlining decisions more predictable in our main
|
||||
queries. (Adrien Grand)
|
||||
|
||||
* GITHUB#14032: Speed up PostingsEnum when positions are requested.
|
||||
(Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
|
||||
|
|
|
@ -638,9 +638,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
final boolean indexHasPayloads;
|
||||
final boolean indexHasOffsetsOrPayloads;
|
||||
|
||||
private int freq; // freq we last read
|
||||
private long freqFP; // offset of the freq block
|
||||
|
||||
private int position; // current position
|
||||
|
||||
// value of docBufferUpto on the last doc ID when positions have been read
|
||||
private int posDocBufferUpto;
|
||||
|
||||
// how many positions "behind" we are; nextPosition must
|
||||
// skip these to "catch up":
|
||||
private int posPendingCount;
|
||||
|
@ -662,6 +666,7 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
|
||||
private boolean needsOffsets; // true if we actually need offsets
|
||||
private boolean needsPayloads; // true if we actually need payloads
|
||||
private boolean needsPayloadsOrOffsets;
|
||||
|
||||
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
|
||||
super(fieldInfo);
|
||||
|
@ -745,8 +750,11 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
|
||||
}
|
||||
|
||||
this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS);
|
||||
this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS);
|
||||
this.needsOffsets =
|
||||
indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS);
|
||||
this.needsPayloads =
|
||||
indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS);
|
||||
this.needsPayloadsOrOffsets = this.needsPayloads || this.needsOffsets;
|
||||
|
||||
level1BlockPosUpto = 0;
|
||||
level1BlockPayUpto = 0;
|
||||
|
@ -758,8 +766,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int freq() {
|
||||
return freq;
|
||||
public int freq() throws IOException {
|
||||
if (freqFP != -1) {
|
||||
docIn.seek(freqFP);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
freqFP = -1;
|
||||
}
|
||||
return freqBuffer[docBufferUpto - 1];
|
||||
}
|
||||
|
||||
private void refillDocs() throws IOException {
|
||||
|
@ -768,11 +781,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
|
||||
if (left >= BLOCK_SIZE) {
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
freqFP = docIn.getFilePointer();
|
||||
PForUtil.skip(docIn);
|
||||
docCountUpto += BLOCK_SIZE;
|
||||
} else if (docFreq == 1) {
|
||||
docBuffer[0] = singletonDocID;
|
||||
freqBuffer[0] = (int) totalTermFreq;
|
||||
freqFP = -1;
|
||||
docBuffer[1] = NO_MORE_DOCS;
|
||||
docCountUpto++;
|
||||
docBufferSize = 1;
|
||||
|
@ -781,11 +796,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
|
||||
prefixSum(docBuffer, left, prevDocID);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
freqFP = -1;
|
||||
docCountUpto += left;
|
||||
docBufferSize = left;
|
||||
}
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
posDocBufferUpto = 0;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
|
@ -846,6 +863,8 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
payloadByteUpto = level0BlockPayUpto;
|
||||
}
|
||||
posBufferUpto = BLOCK_SIZE;
|
||||
} else {
|
||||
posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (docFreq - docCountUpto >= BLOCK_SIZE) {
|
||||
|
@ -875,34 +894,23 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
this.doc = docBuffer[docBufferUpto];
|
||||
this.freq = freqBuffer[docBufferUpto];
|
||||
docBufferUpto++;
|
||||
posPendingCount += freq;
|
||||
position = 0;
|
||||
lastStartOffset = 0;
|
||||
return doc;
|
||||
}
|
||||
|
||||
private void skipLevel0To(int target) throws IOException {
|
||||
long posFP;
|
||||
int posUpto;
|
||||
long payFP;
|
||||
int payUpto;
|
||||
|
||||
while (true) {
|
||||
prevDocID = level0LastDocID;
|
||||
|
||||
// If nextBlockPosFP is less than the current FP, it means that the block of positions for
|
||||
// the first docs of the next block are already decoded. In this case we just accumulate
|
||||
// frequencies into posPendingCount instead of seeking backwards and decoding the same pos
|
||||
// block again.
|
||||
if (level0PosEndFP >= posIn.getFilePointer()) {
|
||||
posIn.seek(level0PosEndFP);
|
||||
posPendingCount = level0BlockPosUpto;
|
||||
if (indexHasOffsetsOrPayloads) {
|
||||
assert level0PayEndFP >= payIn.getFilePointer();
|
||||
payIn.seek(level0PayEndFP);
|
||||
payloadByteUpto = level0BlockPayUpto;
|
||||
}
|
||||
posBufferUpto = BLOCK_SIZE;
|
||||
} else {
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE);
|
||||
}
|
||||
posFP = level0PosEndFP;
|
||||
posUpto = level0BlockPosUpto;
|
||||
payFP = level0PayEndFP;
|
||||
payUpto = level0BlockPayUpto;
|
||||
|
||||
if (docFreq - docCountUpto >= BLOCK_SIZE) {
|
||||
docIn.readVLong(); // skip0 num bytes
|
||||
|
@ -931,6 +939,23 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If nextBlockPosFP is less than the current FP, it means that the block of positions for
|
||||
// the first docs of the next block are already decoded. In this case we just accumulate
|
||||
// frequencies into posPendingCount instead of seeking backwards and decoding the same pos
|
||||
// block again.
|
||||
if (posFP >= posIn.getFilePointer()) {
|
||||
posIn.seek(posFP);
|
||||
posPendingCount = posUpto;
|
||||
if (indexHasOffsetsOrPayloads) {
|
||||
assert level0PayEndFP >= payIn.getFilePointer();
|
||||
payIn.seek(payFP);
|
||||
payloadByteUpto = payUpto;
|
||||
}
|
||||
posBufferUpto = BLOCK_SIZE;
|
||||
} else {
|
||||
posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -947,16 +972,12 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
|
||||
this.freq = freqBuffer[next];
|
||||
this.docBufferUpto = next + 1;
|
||||
position = 0;
|
||||
lastStartOffset = 0;
|
||||
|
||||
return this.doc = docBuffer[next];
|
||||
}
|
||||
|
||||
private void skipPositions() throws IOException {
|
||||
private void skipPositions(int freq) throws IOException {
|
||||
// Skip positions now:
|
||||
int toSkip = posPendingCount - freq;
|
||||
// if (DEBUG) {
|
||||
|
@ -1003,8 +1024,7 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
lastStartOffset = 0;
|
||||
}
|
||||
|
||||
private void refillPositions() throws IOException {
|
||||
if (posIn.getFilePointer() == lastPosBlockFP) {
|
||||
private void refillLastPositionBlock() throws IOException {
|
||||
final int count = (int) (totalTermFreq % BLOCK_SIZE);
|
||||
int payloadLength = 0;
|
||||
int offsetLength = 0;
|
||||
|
@ -1038,6 +1058,11 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
|
||||
private void refillPositions() throws IOException {
|
||||
if (posIn.getFilePointer() == lastPosBlockFP) {
|
||||
refillLastPositionBlock();
|
||||
} else {
|
||||
pforUtil.decode(posInUtil, posDeltaBuffer);
|
||||
|
||||
|
@ -1054,8 +1079,7 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
// this works, because when writing a vint block we always force the first length to be
|
||||
// written
|
||||
PForUtil.skip(payIn); // skip over lengths
|
||||
int numBytes = payIn.readVInt(); // read length of payloadBytes
|
||||
payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
|
||||
payIn.skipBytes(payIn.readVInt()); // skip over payloadBytes
|
||||
}
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
|
@ -1074,33 +1098,50 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
private void accumulatePayloadAndOffsets() {
|
||||
if (needsPayloads) {
|
||||
payloadLength = payloadLengthBuffer[posBufferUpto];
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = payloadByteUpto;
|
||||
payload.length = payloadLength;
|
||||
payloadByteUpto += payloadLength;
|
||||
}
|
||||
|
||||
if (needsOffsets) {
|
||||
startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto];
|
||||
endOffset = startOffset + offsetLengthBuffer[posBufferUpto];
|
||||
lastStartOffset = startOffset;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
if (posDocBufferUpto != docBufferUpto) {
|
||||
int freq = freq(); // triggers lazy decoding of freqs
|
||||
|
||||
// First position that is being read on this doc.
|
||||
posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
|
||||
posDocBufferUpto = docBufferUpto;
|
||||
|
||||
assert posPendingCount > 0;
|
||||
|
||||
if (posPendingCount > freq) {
|
||||
skipPositions();
|
||||
skipPositions(freq);
|
||||
posPendingCount = freq;
|
||||
}
|
||||
|
||||
position = 0;
|
||||
lastStartOffset = 0;
|
||||
}
|
||||
|
||||
if (posBufferUpto == BLOCK_SIZE) {
|
||||
refillPositions();
|
||||
posBufferUpto = 0;
|
||||
}
|
||||
position += posDeltaBuffer[posBufferUpto];
|
||||
|
||||
if (indexHasPayloads) {
|
||||
payloadLength = payloadLengthBuffer[posBufferUpto];
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = payloadByteUpto;
|
||||
payload.length = payloadLength;
|
||||
payloadByteUpto += payloadLength;
|
||||
}
|
||||
|
||||
if (indexHasOffsets) {
|
||||
startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto];
|
||||
endOffset = startOffset + offsetLengthBuffer[posBufferUpto];
|
||||
lastStartOffset = startOffset;
|
||||
if (needsPayloadsOrOffsets) {
|
||||
accumulatePayloadAndOffsets();
|
||||
}
|
||||
|
||||
posBufferUpto++;
|
||||
|
@ -1110,17 +1151,23 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
if (needsOffsets == false) {
|
||||
return -1;
|
||||
}
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
if (needsOffsets == false) {
|
||||
return -1;
|
||||
}
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
if (payloadLength == 0) {
|
||||
if (needsPayloads == false || payloadLength == 0) {
|
||||
return null;
|
||||
} else {
|
||||
return payload;
|
||||
|
@ -1466,9 +1513,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
final boolean indexHasPayloads;
|
||||
final boolean indexHasOffsetsOrPayloads;
|
||||
|
||||
private int freq; // freq we last read
|
||||
private long freqFP; // offset of the freq block
|
||||
|
||||
private int position; // current position
|
||||
|
||||
// value of docBufferUpto on the last doc ID when positions have been read
|
||||
private int posDocBufferUpto;
|
||||
|
||||
// how many positions "behind" we are; nextPosition must
|
||||
// skip these to "catch up":
|
||||
private int posPendingCount;
|
||||
|
@ -1516,8 +1567,13 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int freq() {
|
||||
return freq;
|
||||
public int freq() throws IOException {
|
||||
if (freqFP != -1) {
|
||||
docIn.seek(freqFP);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
freqFP = -1;
|
||||
}
|
||||
return freqBuffer[docBufferUpto - 1];
|
||||
}
|
||||
|
||||
private void refillDocs() throws IOException {
|
||||
|
@ -1526,24 +1582,30 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
|
||||
if (left >= BLOCK_SIZE) {
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
freqFP = docIn.getFilePointer();
|
||||
PForUtil.skip(docIn);
|
||||
docCountUpto += BLOCK_SIZE;
|
||||
} else if (docFreq == 1) {
|
||||
docBuffer[0] = singletonDocID;
|
||||
freqBuffer[0] = (int) totalTermFreq;
|
||||
freqFP = -1;
|
||||
docBuffer[1] = NO_MORE_DOCS;
|
||||
docCountUpto++;
|
||||
docBufferSize = 1;
|
||||
|
||||
} else {
|
||||
// Read vInts:
|
||||
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
|
||||
prefixSum(docBuffer, left, prevDocID);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
freqFP = -1;
|
||||
docCountUpto += left;
|
||||
docBufferSize = left;
|
||||
freqFP = -1;
|
||||
}
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
posDocBufferUpto = 0;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
|
@ -1585,20 +1647,14 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
private void skipLevel0To(int target) throws IOException {
|
||||
long posFP;
|
||||
int posUpto;
|
||||
|
||||
while (true) {
|
||||
prevDocID = level0LastDocID;
|
||||
|
||||
// If nextBlockPosFP is less than the current FP, it means that the block of positions for
|
||||
// the first docs of the next block are already decoded. In this case we just accumulate
|
||||
// frequencies into posPendingCount instead of seeking backwards and decoding the same pos
|
||||
// block again.
|
||||
if (level0PosEndFP >= posIn.getFilePointer()) {
|
||||
posIn.seek(level0PosEndFP);
|
||||
posPendingCount = level0BlockPosUpto;
|
||||
posBufferUpto = BLOCK_SIZE;
|
||||
} else {
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE);
|
||||
}
|
||||
posFP = level0PosEndFP;
|
||||
posUpto = level0BlockPosUpto;
|
||||
|
||||
if (docFreq - docCountUpto >= BLOCK_SIZE) {
|
||||
docIn.readVLong(); // skip0 num bytes
|
||||
|
@ -1631,6 +1687,18 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If nextBlockPosFP is less than the current FP, it means that the block of positions for
|
||||
// the first docs of the next block are already decoded. In this case we just accumulate
|
||||
// frequencies into posPendingCount instead of seeking backwards and decoding the same pos
|
||||
// block again.
|
||||
if (posFP >= posIn.getFilePointer()) {
|
||||
posIn.seek(posFP);
|
||||
posPendingCount = posUpto;
|
||||
posBufferUpto = BLOCK_SIZE;
|
||||
} else {
|
||||
posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1660,30 +1728,25 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
doc = docBuffer[docBufferUpto];
|
||||
freq = freqBuffer[docBufferUpto];
|
||||
posPendingCount += freq;
|
||||
docBufferUpto++;
|
||||
position = 0;
|
||||
return this.doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
if (target > level0LastDocID || needsRefilling) {
|
||||
advanceShallow(target);
|
||||
if (needsRefilling) {
|
||||
assert needsRefilling;
|
||||
refillDocs();
|
||||
needsRefilling = false;
|
||||
}
|
||||
|
||||
int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
|
||||
freq = freqBuffer[next];
|
||||
docBufferUpto = next + 1;
|
||||
position = 0;
|
||||
return this.doc = docBuffer[next];
|
||||
}
|
||||
|
||||
private void skipPositions() throws IOException {
|
||||
private void skipPositions(int freq) throws IOException {
|
||||
// Skip positions now:
|
||||
int toSkip = posPendingCount - freq;
|
||||
// if (DEBUG) {
|
||||
|
@ -1703,8 +1766,6 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
refillPositions();
|
||||
posBufferUpto = toSkip;
|
||||
}
|
||||
|
||||
position = 0;
|
||||
}
|
||||
|
||||
private void refillPositions() throws IOException {
|
||||
|
@ -1739,13 +1800,23 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
if (posDocBufferUpto != docBufferUpto) {
|
||||
int freq = freq(); // triggers lazy decoding of freqs
|
||||
|
||||
// First position that is being read on this doc.
|
||||
posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
|
||||
posDocBufferUpto = docBufferUpto;
|
||||
|
||||
assert posPendingCount > 0;
|
||||
|
||||
if (posPendingCount > freq) {
|
||||
skipPositions();
|
||||
skipPositions(freq);
|
||||
posPendingCount = freq;
|
||||
}
|
||||
|
||||
position = 0;
|
||||
}
|
||||
|
||||
if (posBufferUpto == BLOCK_SIZE) {
|
||||
refillPositions();
|
||||
posBufferUpto = 0;
|
||||
|
|
Loading…
Reference in New Issue