From 05ed6ebd3b981f99665ab7f2dcdebfb68cd7752d Mon Sep 17 00:00:00 2001
From: Adrien Grand <jpountz@gmail.com>
Date: Wed, 4 Dec 2024 15:19:41 +0100
Subject: [PATCH] Combine all postings enum impls of the default codec into a
 single class (#14033)

Recent speedups by making call sites bimorphic made me want to play with
combining all postings enums and impacts enums of the default codec into a
single class, in order to reduce polymorphism. Unfortunately, it does not
yield a speedup since the major polymorphic call sites we have that hurt
performance (DefaultBulkScorer, ConjunctionDISI) are still 3-polymorphic or
more. Yet, reduced polymorphism at little performance impact is a good
trade-off as it would help make call sites bimorphic for users who don't have
as much query diversity as nightly benchmarks, or in the future when we remove
other causes of polymorphism.
---
 lucene/CHANGES.txt                            |    3 +
 .../lucene101/Lucene101PostingsReader.java    | 1571 +++++------------
 .../org/apache/lucene/search/PhraseQuery.java |    7 +-
 .../intervals/TermIntervalsSource.java        |    4 +-
 4 files changed, 454 insertions(+), 1131 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0029c94bdd4..1bbd3b179ec 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -34,6 +34,9 @@ Improvements
   segments for merges below the floor segment size, even if maxMergeAtOnce is
   bigger than segsPerTier. (Adrien Grand)
 
+* GITHUB#14033: Combine all postings enum impls of the default codec into a
+  single class. (Adrien Grand)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
index d879a58b4ab..010ed43135f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
@@ -44,7 +44,6 @@ import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.SlowImpactsEnum;
 import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
 import org.apache.lucene.internal.vectorization.VectorizationProvider;
 import org.apache.lucene.store.ByteArrayDataInput;
@@ -272,347 +271,61 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
   public PostingsEnum postings(
       FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags)
       throws IOException {
-    if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0
-        || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) {
-      return (reuse instanceof BlockDocsEnum blockDocsEnum
-              && blockDocsEnum.canReuse(docIn, fieldInfo)
-          ? blockDocsEnum
-          : new BlockDocsEnum(fieldInfo))
-          .reset((IntBlockTermState) termState, flags);
-    } else {
-      return (reuse instanceof EverythingEnum everythingEnum
-              && everythingEnum.canReuse(docIn, fieldInfo)
-          ? everythingEnum
-          : new EverythingEnum(fieldInfo))
-          .reset((IntBlockTermState) termState, flags);
-    }
+    return (reuse instanceof BlockPostingsEnum everythingEnum
+            && everythingEnum.canReuse(docIn, fieldInfo, flags, false)
+        ? everythingEnum
+        : new BlockPostingsEnum(fieldInfo, flags, false))
+        .reset((IntBlockTermState) termState, flags);
   }
 
   @Override
   public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags)
       throws IOException {
-    final IndexOptions options = fieldInfo.getIndexOptions();
-    final boolean indexHasPositions =
-        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
-
-    if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
-        && (indexHasPositions == false
-            || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) {
-      return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state);
-    }
-
-    if (indexHasPositions
-        && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0
-            || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false)
-        && (fieldInfo.hasPayloads() == false
-            || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
-      return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state);
-    }
-
-    return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
+    return new BlockPostingsEnum(fieldInfo, flags, true).reset((IntBlockTermState) state, flags);
   }
 
-  private static long sumOverRange(int[] arr, int start, int end) {
-    long res = 0L;
+  private static int sumOverRange(int[] arr, int start, int end) {
+    int res = 0;
     for (int i = start; i < end; i++) {
       res += arr[i];
     }
     return res;
   }
 
-  private abstract class AbstractPostingsEnum extends PostingsEnum {
+  final class BlockPostingsEnum extends ImpactsEnum {
 
-    protected ForDeltaUtil forDeltaUtil;
-    protected PForUtil pforUtil;
+    private ForDeltaUtil forDeltaUtil;
+    private PForUtil pforUtil;
 
-    protected final int[] docBuffer = new int[BLOCK_SIZE + 1];
-    protected final boolean indexHasFreq;
+    private final int[] docBuffer = new int[BLOCK_SIZE + 1];
 
-    protected int doc; // doc we last read
+    private int doc; // doc we last read
 
     // level 0 skip data
-    protected int level0LastDocID;
+    private int level0LastDocID;
+    private long level0DocEndFP;
+
     // level 1 skip data
-    protected int level1LastDocID;
-    protected long level1DocEndFP;
-    protected int level1DocCountUpto;
+    private int level1LastDocID;
+    private long level1DocEndFP;
+    private int level1DocCountUpto;
 
-    protected int docFreq; // number of docs in this posting list
-    protected long
-        totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted)
+    private int docFreq; // number of docs in this posting list
+    private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted)
 
-    protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
+    private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
 
-    protected int docCountUpto; // number of docs in or before the current block
-    protected int prevDocID; // last doc ID of the previous block
+    private int docCountUpto; // number of docs in or before the current block
+    private int prevDocID; // last doc ID of the previous block
 
-    protected int docBufferSize;
-    protected int docBufferUpto;
+    private int docBufferSize;
+    private int docBufferUpto;
 
-    protected IndexInput docIn;
-    protected PostingDecodingUtil docInUtil;
-
-    protected AbstractPostingsEnum(FieldInfo fieldInfo) {
-      indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
-      // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in
-      // advance()
-      docBuffer[BLOCK_SIZE] = NO_MORE_DOCS;
-    }
-
-    @Override
-    public int docID() {
-      return doc;
-    }
-
-    protected void resetIndexInput(IntBlockTermState termState) throws IOException {
-      docFreq = termState.docFreq;
-      singletonDocID = termState.singletonDocID;
-      if (docFreq > 1) {
-        if (docIn == null) {
-          // lazy init
-          docIn = Lucene101PostingsReader.this.docIn.clone();
-          docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
-        }
-        prefetchPostings(docIn, termState);
-      }
-    }
-
-    protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException {
-      doc = -1;
-      prevDocID = -1;
-      docCountUpto = 0;
-      level0LastDocID = -1;
-      if (docFreq < LEVEL1_NUM_DOCS) {
-        level1LastDocID = NO_MORE_DOCS;
-        if (docFreq > 1) {
-          docIn.seek(termState.docStartFP);
-        }
-      } else {
-        level1LastDocID = -1;
-        level1DocEndFP = termState.docStartFP;
-      }
-      level1DocCountUpto = 0;
-      docBufferSize = BLOCK_SIZE;
-      docBufferUpto = BLOCK_SIZE;
-      return this;
-    }
-  }
-
-  final class BlockDocsEnum extends AbstractPostingsEnum {
+    private IndexInput docIn;
+    private PostingDecodingUtil docInUtil;
 
     private final int[] freqBuffer = new int[BLOCK_SIZE];
-
-    private boolean needsFreq; // true if the caller actually needs frequencies
-    private long freqFP;
-
-    public BlockDocsEnum(FieldInfo fieldInfo) {
-      super(fieldInfo);
-    }
-
-    public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
-      final IndexOptions options = fieldInfo.getIndexOptions();
-      return docIn == Lucene101PostingsReader.this.docIn
-          && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0);
-    }
-
-    public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
-      resetIndexInput(termState);
-      if (pforUtil == null && docFreq >= BLOCK_SIZE) {
-        pforUtil = new PForUtil();
-        forDeltaUtil = new ForDeltaUtil();
-      }
-      totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
-
-      this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS);
-      if (indexHasFreq == false || needsFreq == false) {
-        // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to
-        // not fill more than `docFreq` entries.
-        Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1);
-      }
-      freqFP = -1;
-      return resetIdsAndLevelParams(termState);
-    }
-
-    @Override
-    public int freq() throws IOException {
-      if (freqFP != -1) {
-        docIn.seek(freqFP);
-        pforUtil.decode(docInUtil, freqBuffer);
-        freqFP = -1;
-      }
-
-      return freqBuffer[docBufferUpto - 1];
-    }
-
-    @Override
-    public int nextPosition() {
-      return -1;
-    }
-
-    @Override
-    public int startOffset() {
-      return -1;
-    }
-
-    @Override
-    public int endOffset() {
-      return -1;
-    }
-
-    @Override
-    public BytesRef getPayload() {
-      return null;
-    }
-
-    private void refillFullBlock() throws IOException {
-      assert docFreq - docCountUpto >= BLOCK_SIZE;
-
-      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-
-      if (indexHasFreq) {
-        if (needsFreq) {
-          freqFP = docIn.getFilePointer();
-        }
-        PForUtil.skip(docIn);
-      }
-      docCountUpto += BLOCK_SIZE;
-      prevDocID = docBuffer[BLOCK_SIZE - 1];
-      docBufferUpto = 0;
-      assert docBuffer[docBufferSize] == NO_MORE_DOCS;
-    }
-
-    private void refillRemainder() throws IOException {
-      final int left = docFreq - docCountUpto;
-      assert left >= 0;
-      assert left < BLOCK_SIZE;
-
-      if (docFreq == 1) {
-        docBuffer[0] = singletonDocID;
-        freqBuffer[0] = (int) totalTermFreq;
-        docBuffer[1] = NO_MORE_DOCS;
-        docCountUpto++;
-      } else {
-        // Read vInts:
-        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
-        prefixSum(docBuffer, left, prevDocID);
-        docBuffer[left] = NO_MORE_DOCS;
-        docCountUpto += left;
-      }
-      docBufferUpto = 0;
-      docBufferSize = left;
-      freqFP = -1;
-    }
-
-    private void skipLevel1To(int target) throws IOException {
-      while (true) {
-        prevDocID = level1LastDocID;
-        level0LastDocID = level1LastDocID;
-        docIn.seek(level1DocEndFP);
-        docCountUpto = level1DocCountUpto;
-        level1DocCountUpto += LEVEL1_NUM_DOCS;
-
-        if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) {
-          level1LastDocID = NO_MORE_DOCS;
-          break;
-        }
-
-        level1LastDocID += docIn.readVInt();
-        level1DocEndFP = docIn.readVLong() + docIn.getFilePointer();
-
-        if (level1LastDocID >= target) {
-          if (indexHasFreq) {
-            // skip impacts and pos skip data
-            docIn.skipBytes(docIn.readShort());
-          }
-          break;
-        }
-      }
-    }
-
-    private void skipLevel0To(int target) throws IOException {
-      while (true) {
-        prevDocID = level0LastDocID;
-        if (docFreq - docCountUpto >= BLOCK_SIZE) {
-          long skip0NumBytes = docIn.readVLong();
-          // end offset of skip data (before the actual data starts)
-          long skip0EndFP = docIn.getFilePointer() + skip0NumBytes;
-          int docDelta = readVInt15(docIn);
-          level0LastDocID += docDelta;
-
-          if (target <= level0LastDocID) {
-            docIn.seek(skip0EndFP);
-            break;
-          }
-
-          // skip block
-          docIn.skipBytes(readVLong15(docIn));
-          docCountUpto += BLOCK_SIZE;
-        } else {
-          level0LastDocID = NO_MORE_DOCS;
-          break;
-        }
-      }
-    }
-
-    private void moveToNextLevel0Block() throws IOException {
-      if (doc == level1LastDocID) { // advance skip data on level 1
-        skipLevel1To(doc + 1);
-      }
-
-      prevDocID = level0LastDocID;
-      if (docFreq - docCountUpto >= BLOCK_SIZE) {
-        docIn.skipBytes(docIn.readVLong());
-        refillFullBlock();
-        level0LastDocID = docBuffer[BLOCK_SIZE - 1];
-      } else {
-        level0LastDocID = NO_MORE_DOCS;
-        refillRemainder();
-      }
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0
-        moveToNextLevel0Block();
-      }
-
-      return this.doc = docBuffer[docBufferUpto++];
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      if (target > level0LastDocID) { // advance skip data on level 0
-
-        if (target > level1LastDocID) { // advance skip data on level 1
-          skipLevel1To(target);
-        }
-
-        skipLevel0To(target);
-
-        if (docFreq - docCountUpto >= BLOCK_SIZE) {
-          refillFullBlock();
-        } else {
-          refillRemainder();
-        }
-      }
-
-      int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
-      this.doc = docBuffer[next];
-      docBufferUpto = next + 1;
-      return doc;
-    }
-
-    @Override
-    public long cost() {
-      return docFreq;
-    }
-  }
-
-  final class EverythingEnum extends AbstractPostingsEnum {
-
-    private final int[] freqBuffer = new int[BLOCK_SIZE + 1];
-    private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
+    private final int[] posDeltaBuffer;
 
     private final int[] payloadLengthBuffer;
     private final int[] offsetStartDeltaBuffer;
@@ -634,10 +347,22 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
     final PostingDecodingUtil payInUtil;
     final BytesRef payload;
 
+    final IndexOptions options;
+    final boolean indexHasFreq;
+    final boolean indexHasPos;
     final boolean indexHasOffsets;
     final boolean indexHasPayloads;
     final boolean indexHasOffsetsOrPayloads;
 
+    final int flags;
+    final boolean needsFreq;
+    final boolean needsPos;
+    final boolean needsOffsets;
+    final boolean needsPayloads;
+    final boolean needsOffsetsOrPayloads;
+    final boolean needsImpacts;
+    final boolean needsDocsAndFreqsOnly;
+
     private long freqFP; // offset of the freq block
 
     private int position; // current position
@@ -654,40 +379,84 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
     // decode vs vInt decode the block:
     private long lastPosBlockFP;
 
+    // level 0 skip data
     private long level0PosEndFP;
     private int level0BlockPosUpto;
     private long level0PayEndFP;
     private int level0BlockPayUpto;
+    private final BytesRef level0SerializedImpacts;
+    private final MutableImpactList level0Impacts;
+
+    // level 1 skip data
     private long level1PosEndFP;
     private int level1BlockPosUpto;
     private long level1PayEndFP;
     private int level1BlockPayUpto;
+    private final BytesRef level1SerializedImpacts;
+    private final MutableImpactList level1Impacts;
 
-    private boolean needsOffsets; // true if we actually need offsets
-    private boolean needsPayloads; // true if we actually need payloads
-    private boolean needsPayloadsOrOffsets;
+    // true if we shallow-advanced to a new block that we have not decoded yet
+    private boolean needsRefilling;
 
-    public EverythingEnum(FieldInfo fieldInfo) throws IOException {
-      super(fieldInfo);
+    public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts)
+        throws IOException {
+      options = fieldInfo.getIndexOptions();
+      indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+      indexHasPos = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       indexHasOffsets =
-          fieldInfo
-              .getIndexOptions()
-              .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
-              >= 0;
+          options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
       indexHasPayloads = fieldInfo.hasPayloads();
       indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
 
-      this.posIn = Lucene101PostingsReader.this.posIn.clone();
-      posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
-      if (indexHasOffsetsOrPayloads) {
+      this.flags = flags;
+      needsFreq = indexHasFreq && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS);
+      needsPos = indexHasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS);
+      needsOffsets = indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS);
+      needsPayloads =
+          indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS);
+      needsOffsetsOrPayloads = needsOffsets || needsPayloads;
+      this.needsImpacts = needsImpacts;
+      needsDocsAndFreqsOnly = needsPos == false && needsImpacts == false;
+
+      // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in
+      // advance()
+      docBuffer[BLOCK_SIZE] = NO_MORE_DOCS;
+
+      if (needsFreq == false) {
+        Arrays.fill(freqBuffer, 1);
+      }
+
+      if (needsFreq && needsImpacts) {
+        level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
+        level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
+        level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0);
+        level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1);
+      } else {
+        level0SerializedImpacts = null;
+        level1SerializedImpacts = null;
+        level0Impacts = null;
+        level1Impacts = null;
+      }
+
+      if (needsPos) {
+        this.posIn = Lucene101PostingsReader.this.posIn.clone();
+        posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
+        posDeltaBuffer = new int[BLOCK_SIZE];
+      } else {
+        this.posIn = null;
+        this.posInUtil = null;
+        posDeltaBuffer = null;
+      }
+
+      if (needsOffsets || needsPayloads) {
         this.payIn = Lucene101PostingsReader.this.payIn.clone();
         payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn);
       } else {
         this.payIn = null;
         payInUtil = null;
       }
-      if (indexHasOffsets) {
+
+      if (needsOffsets) {
         offsetStartDeltaBuffer = new int[BLOCK_SIZE];
         offsetLengthBuffer = new int[BLOCK_SIZE];
       } else {
@@ -708,33 +477,45 @@
       }
     }
 
-    public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
+    public boolean canReuse(
+        IndexInput docIn, FieldInfo fieldInfo, int flags, boolean needsImpacts) {
       return docIn == Lucene101PostingsReader.this.docIn
-          && indexHasOffsets
-              == (fieldInfo
-                      .getIndexOptions()
-                      .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
-                  >= 0)
-          && indexHasPayloads == fieldInfo.hasPayloads();
+          && options == fieldInfo.getIndexOptions()
+          && indexHasPayloads == fieldInfo.hasPayloads()
+          && this.flags == flags
+          && this.needsImpacts == needsImpacts;
     }
 
-    public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
-      resetIndexInput(termState);
+    public BlockPostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
+      docFreq = termState.docFreq;
+      singletonDocID = termState.singletonDocID;
+      if (docFreq > 1) {
+        if (docIn == null) {
+          // lazy init
+          docIn = Lucene101PostingsReader.this.docIn.clone();
+          docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
+        }
+        prefetchPostings(docIn, termState);
+      }
+
       if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) {
         forDeltaUtil = new ForDeltaUtil();
       }
-      totalTermFreq = termState.totalTermFreq;
-      if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) {
+      totalTermFreq = indexHasFreq ? termState.totalTermFreq : termState.docFreq;
+      if (needsFreq && pforUtil == null && totalTermFreq >= BLOCK_SIZE) {
         pforUtil = new PForUtil();
       }
+
       // Where this term's postings start in the .pos file:
       final long posTermStartFP = termState.posStartFP;
       // Where this term's payloads/offsets start in the .pay
       // file:
       final long payTermStartFP = termState.payStartFP;
-      posIn.seek(posTermStartFP);
-      if (indexHasOffsetsOrPayloads) {
-        payIn.seek(payTermStartFP);
+      if (posIn != null) {
+        posIn.seek(posTermStartFP);
+        if (payIn != null) {
+          payIn.seek(payTermStartFP);
+        }
       }
       level1PosEndFP = posTermStartFP;
       level1PayEndFP = payTermStartFP;
@@ -750,19 +531,37 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
         lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
       }
 
-      this.needsOffsets =
-          indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS);
-      this.needsPayloads =
-          indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS);
-      this.needsPayloadsOrOffsets = this.needsPayloads || this.needsOffsets;
-
       level1BlockPosUpto = 0;
       level1BlockPayUpto = 0;
       level0BlockPosUpto = 0;
       level0BlockPayUpto = 0;
       posBufferUpto = BLOCK_SIZE;
 
-      return resetIdsAndLevelParams(termState);
+      doc = -1;
+      prevDocID = -1;
+      docCountUpto = 0;
+      freqFP = -1L;
+      level0LastDocID = -1;
+      if (docFreq < LEVEL1_NUM_DOCS) {
+        level1LastDocID = NO_MORE_DOCS;
+        if (docFreq > 1) {
+          docIn.seek(termState.docStartFP);
+        }
+      } else {
+        level1LastDocID = -1;
+        level1DocEndFP = termState.docStartFP;
+      }
+      level1DocCountUpto = 0;
+      docBufferSize = BLOCK_SIZE;
+      docBufferUpto = BLOCK_SIZE;
+      posDocBufferUpto = BLOCK_SIZE;
+
+      return this;
+    }
+
+    @Override
+    public int docID() {
+      return doc;
     }
 
     @Override
@@ -775,28 +574,37 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
       return freqBuffer[docBufferUpto - 1];
     }
 
-    private void refillDocs() throws IOException {
-      final int left = docFreq - docCountUpto;
-      assert left >= 0;
-
-      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-        freqFP = docIn.getFilePointer();
+    private void refillFullBlock() throws IOException {
+      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+      if (indexHasFreq) {
+        if (needsFreq) {
+          freqFP = docIn.getFilePointer();
+        }
         PForUtil.skip(docIn);
-        docCountUpto += BLOCK_SIZE;
-      } else if (docFreq == 1) {
+      }
+      docCountUpto += BLOCK_SIZE;
+      prevDocID = docBuffer[BLOCK_SIZE - 1];
+      docBufferUpto = 0;
+      posDocBufferUpto = 0;
+      assert docBuffer[docBufferSize] == NO_MORE_DOCS;
+    }
+
+    private void refillRemainder() throws IOException {
+      final int left = docFreq - docCountUpto;
+      assert left >= 0 && left < BLOCK_SIZE;
+      if (docFreq == 1) {
         docBuffer[0] = singletonDocID;
         freqBuffer[0] = (int) totalTermFreq;
-        freqFP = -1;
         docBuffer[1] = NO_MORE_DOCS;
+        assert freqFP == -1;
         docCountUpto++;
         docBufferSize = 1;
       } else {
         // Read vInts:
-        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
+        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
         prefixSum(docBuffer, left, prevDocID);
         docBuffer[left] = NO_MORE_DOCS;
-        freqFP = -1;
+        freqFP = -1L;
         docCountUpto += left;
         docBufferSize = left;
       }
@@ -806,6 +614,17 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
       assert docBuffer[docBufferSize] == NO_MORE_DOCS;
     }
 
+    private void refillDocs() throws IOException {
+      final int left = docFreq - docCountUpto;
+      assert left >= 0;
+
+      if (left >= BLOCK_SIZE) {
+        refillFullBlock();
+      } else {
+        refillRemainder();
+      }
+    }
+
     private void skipLevel1To(int target) throws IOException {
       while (true) {
         prevDocID = level1LastDocID;
@@ -813,10 +632,8 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
         docIn.seek(level1DocEndFP);
         level0PosEndFP = level1PosEndFP;
         level0BlockPosUpto = level1BlockPosUpto;
-        if (indexHasOffsetsOrPayloads) {
-          level0PayEndFP = level1PayEndFP;
-          level0BlockPayUpto = level1BlockPayUpto;
-        }
+        level0PayEndFP = level1PayEndFP;
+        level0BlockPayUpto = level1BlockPayUpto;
         docCountUpto = level1DocCountUpto;
         level1DocCountUpto += LEVEL1_NUM_DOCS;
 
@@ -829,15 +646,25 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
         long delta = docIn.readVLong();
         level1DocEndFP = delta + docIn.getFilePointer();
 
-        long skip1EndFP = docIn.readShort() + docIn.getFilePointer();
-        docIn.skipBytes(docIn.readShort()); // impacts
-        level1PosEndFP += docIn.readVLong();
-        level1BlockPosUpto = docIn.readByte();
-        if (indexHasOffsetsOrPayloads) {
-          level1PayEndFP += docIn.readVLong();
-          level1BlockPayUpto = docIn.readVInt();
+        if (indexHasFreq) {
+          long skip1EndFP = docIn.readShort() + docIn.getFilePointer();
+          int numImpactBytes = docIn.readShort();
+          if (needsImpacts && level1LastDocID >= target) {
+            docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes);
+            level1SerializedImpacts.length = numImpactBytes;
+          } else {
+            docIn.skipBytes(numImpactBytes);
+          }
+          if (indexHasPos) {
+            level1PosEndFP += docIn.readVLong();
+            level1BlockPosUpto = docIn.readByte();
+            if (indexHasOffsetsOrPayloads) {
+              level1PayEndFP += docIn.readVLong();
+              level1BlockPayUpto = docIn.readVInt();
+            }
+          }
+          assert docIn.getFilePointer() == skip1EndFP;
         }
-        assert docIn.getFilePointer() == skip1EndFP;
 
         if (level1LastDocID >= target) {
           break;
@@ -845,6 +672,55 @@
       }
     }
 
+    private void doMoveToNextLevel0Block() throws IOException {
+      assert docBufferUpto == BLOCK_SIZE;
+      if (posIn != null) {
+        if (level0PosEndFP >= posIn.getFilePointer()) {
+          posIn.seek(level0PosEndFP);
+          posPendingCount = level0BlockPosUpto;
+          if (payIn != null) {
+            assert level0PayEndFP >= payIn.getFilePointer();
+            payIn.seek(level0PayEndFP);
+            payloadByteUpto = level0BlockPayUpto;
+          }
+          posBufferUpto = BLOCK_SIZE;
+        } else {
+          assert freqFP == -1L;
+          posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
+        }
+      }
+
+      if (docFreq - docCountUpto >= BLOCK_SIZE) {
+        docIn.readVLong(); // level0NumBytes
+        int docDelta = readVInt15(docIn);
+        level0LastDocID += docDelta;
+        long blockLength = readVLong15(docIn);
+        level0DocEndFP = docIn.getFilePointer() + blockLength;
+        if (indexHasFreq) {
+          int numImpactBytes = docIn.readVInt();
+          if (needsImpacts) {
+            docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes);
+            level0SerializedImpacts.length = numImpactBytes;
+          } else {
+            docIn.skipBytes(numImpactBytes);
+          }
+
+          if (indexHasPos) {
+            level0PosEndFP += docIn.readVLong();
+            level0BlockPosUpto = docIn.readByte();
+            if (indexHasOffsetsOrPayloads) {
+              level0PayEndFP += docIn.readVLong();
+              level0BlockPayUpto = docIn.readVInt();
+            }
+          }
+        }
+        refillFullBlock();
+      } else {
+        level0LastDocID = NO_MORE_DOCS;
+        refillRemainder();
+      }
+    }
+
     private void moveToNextLevel0Block() throws IOException {
       if (doc == level1LastDocID) { // advance level 1 skip data
         skipLevel1To(doc + 1);
@@ -853,49 +729,43 @@
 
       // Now advance level 0 skip data
       prevDocID = level0LastDocID;
-      assert docBufferUpto == BLOCK_SIZE;
-      if (level0PosEndFP >= posIn.getFilePointer()) {
-        posIn.seek(level0PosEndFP);
-        posPendingCount = level0BlockPosUpto;
-        if (indexHasOffsetsOrPayloads) {
+      if (needsDocsAndFreqsOnly && docFreq - docCountUpto >= BLOCK_SIZE) {
+        // Optimize the common path for exhaustive evaluation
+        long level0NumBytes = docIn.readVLong();
+        docIn.skipBytes(level0NumBytes);
+        refillFullBlock();
+        level0LastDocID = docBuffer[BLOCK_SIZE - 1];
+      } else {
+        doMoveToNextLevel0Block();
+      }
+    }
+
+    private void readLevel0PosData() throws IOException {
+      level0PosEndFP += docIn.readVLong();
+      level0BlockPosUpto = docIn.readByte();
+      if (indexHasOffsetsOrPayloads) {
+        level0PayEndFP += docIn.readVLong();
+        level0BlockPayUpto = docIn.readVInt();
+      }
+    }
+
+    private void seekPosData(long posFP, int posUpto, long payFP, int payUpto) throws IOException {
+      // If nextBlockPosFP is less than the current FP, it means that the block of positions for
+      // the first docs of the next block are already decoded. In this case we just accumulate
+      // frequencies into posPendingCount instead of seeking backwards and decoding the same pos
+      // block again.
+      if (posFP >= posIn.getFilePointer()) {
+        posIn.seek(posFP);
+        posPendingCount = posUpto;
+        if (payIn != null) { // needs payloads or offsets
           assert level0PayEndFP >= payIn.getFilePointer();
-          payIn.seek(level0PayEndFP);
-          payloadByteUpto = level0BlockPayUpto;
+          payIn.seek(payFP);
+          payloadByteUpto = payUpto;
         }
         posBufferUpto = BLOCK_SIZE;
       } else {
         posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
       }
-
-      if (docFreq - docCountUpto >= BLOCK_SIZE) {
-        docIn.readVLong(); // skip0 num bytes
-        int docDelta = readVInt15(docIn);
-        level0LastDocID += docDelta;
-        readVLong15(docIn); // block length
-        docIn.skipBytes(docIn.readVLong()); // impacts
-
-        level0PosEndFP += docIn.readVLong();
-        level0BlockPosUpto = docIn.readByte();
-        if (indexHasOffsetsOrPayloads) {
-          level0PayEndFP += docIn.readVLong();
-          level0BlockPayUpto = docIn.readVInt();
-        }
-      } else {
-        level0LastDocID = NO_MORE_DOCS;
-      }
-
-      refillDocs();
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data
-        moveToNextLevel0Block();
-      }
-
-      this.doc = docBuffer[docBufferUpto];
-      docBufferUpto++;
-      return doc;
     }
 
     private void skipLevel0To(int target) throws IOException {
@@ -913,26 +783,39 @@
       payUpto = level0BlockPayUpto;
 
       if (docFreq - docCountUpto >= BLOCK_SIZE) {
-        docIn.readVLong(); // skip0 num bytes
+        long numSkipBytes = docIn.readVLong();
+        long skip0End = docIn.getFilePointer() + numSkipBytes;
         int docDelta = readVInt15(docIn);
         level0LastDocID += docDelta;
-
+        boolean found = target <= level0LastDocID;
         long blockLength = readVLong15(docIn);
-        long blockEndFP = docIn.getFilePointer() + blockLength;
-        docIn.skipBytes(docIn.readVLong()); // impacts
+        level0DocEndFP = docIn.getFilePointer() + blockLength;
 
-        level0PosEndFP += docIn.readVLong();
-        level0BlockPosUpto = docIn.readByte();
-        if (indexHasOffsetsOrPayloads) {
-          level0PayEndFP += docIn.readVLong();
-          level0BlockPayUpto = docIn.readVInt();
+        if (indexHasFreq) {
+          if (found == false && needsPos == false) {
+            docIn.seek(skip0End);
+          } else {
+            int numImpactBytes = docIn.readVInt();
+            if (needsImpacts && found) {
+              docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes);
+              level0SerializedImpacts.length = numImpactBytes;
+            } else {
+              docIn.skipBytes(numImpactBytes);
+            }
+
+            if (needsPos) {
+              readLevel0PosData();
+            } else {
+              docIn.seek(skip0End);
+            }
+          }
         }
 
-        if (target <= level0LastDocID) {
+        if (found) {
           break;
         }
 
-        docIn.seek(blockEndFP);
+        docIn.seek(level0DocEndFP);
         docCountUpto += BLOCK_SIZE;
       } else {
         level0LastDocID = NO_MORE_DOCS;
@@ -940,41 +823,62 @@
        }
      }
 
-      // If nextBlockPosFP is less than the current FP, it means that the block of positions for
-      // the first docs of the next block are already decoded. In this case we just accumulate
-      // frequencies into posPendingCount instead of seeking backwards and decoding the same pos
-      // block again.
-      if (posFP >= posIn.getFilePointer()) {
-        posIn.seek(posFP);
-        posPendingCount = posUpto;
-        if (indexHasOffsetsOrPayloads) {
-          assert level0PayEndFP >= payIn.getFilePointer();
-          payIn.seek(payFP);
-          payloadByteUpto = payUpto;
-        }
-        posBufferUpto = BLOCK_SIZE;
-      } else {
-        posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
+      if (posIn != null) { // needs positions
+        seekPosData(posFP, posUpto, payFP, payUpto);
       }
     }
 
     @Override
-    public int advance(int target) throws IOException {
+    public void advanceShallow(int target) throws IOException {
       if (target > level0LastDocID) { // advance level 0 skip data
+        doAdvanceShallow(target);
 
-        if (target > level1LastDocID) { // advance level 1 skip data
-          skipLevel1To(target);
+        // If we are on the last doc ID of a block and we are advancing on the doc ID just beyond
+        // this block, then we decode the block. This may not be necessary, but this helps avoid
+        // having to check whether we are in a block that is not decoded yet in #nextDoc().
+        if (docBufferUpto == BLOCK_SIZE && target == doc + 1) {
+          refillDocs();
+          needsRefilling = false;
+        } else {
+          needsRefilling = true;
         }
+      }
+    }
 
-        skipLevel0To(target);
+    private void doAdvanceShallow(int target) throws IOException {
+      if (target > level1LastDocID) { // advance skip data on level 1
+        skipLevel1To(target);
+      } else if (needsRefilling) {
+        docIn.seek(level0DocEndFP);
+        docCountUpto += BLOCK_SIZE;
+      }
+      skipLevel0To(target);
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      if (docBufferUpto == BLOCK_SIZE) {
+        moveToNextLevel0Block();
+      }
+
+      return this.doc = docBuffer[docBufferUpto++];
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      if (target > level0LastDocID || needsRefilling) {
+        if (target > level0LastDocID) {
+          doAdvanceShallow(target);
+        }
         refillDocs();
+        needsRefilling = false;
       }
 
       int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
-      this.docBufferUpto = next + 1;
-
-      return this.doc = docBuffer[next];
+      this.doc = docBuffer[next];
+      docBufferUpto = next + 1;
+      return doc;
     }
 
     private void skipPositions(int freq) throws IOException {
@@ -987,7 +891,7 @@
       final int leftInBlock = BLOCK_SIZE - posBufferUpto;
       if (toSkip < leftInBlock) {
         int end = posBufferUpto + toSkip;
-        if (indexHasPayloads) {
+        if (needsPayloads) {
           payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end);
         }
         posBufferUpto = end;
@@ -997,31 +901,29 @@
           assert posIn.getFilePointer() != lastPosBlockFP;
           PForUtil.skip(posIn);
 
-          if (indexHasPayloads) {
-            // Skip payloadLength block:
-            PForUtil.skip(payIn);
+          if (payIn != null) {
+            if (indexHasPayloads) {
+              // Skip payloadLength block:
+              PForUtil.skip(payIn);
 
-            // Skip payloadBytes block:
-            int numBytes = payIn.readVInt();
-            payIn.seek(payIn.getFilePointer() + numBytes);
-          }
+              // Skip payloadBytes block:
+              int numBytes = payIn.readVInt();
+              payIn.seek(payIn.getFilePointer() + numBytes);
+            }
 
-          if (indexHasOffsets) {
-            PForUtil.skip(payIn);
-            PForUtil.skip(payIn);
+            if (indexHasOffsets) {
+              PForUtil.skip(payIn);
+              PForUtil.skip(payIn);
+            }
           }
           toSkip -= BLOCK_SIZE;
         }
         refillPositions();
-        payloadByteUpto = 0;
-        if (indexHasPayloads) {
-          payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkip);
+        if (needsPayloads) {
+          payloadByteUpto = sumOverRange(payloadLengthBuffer, 0, toSkip);
        }
        posBufferUpto = toSkip;
      }
-
-      position = 0;
-      lastStartOffset = 0;
    }
 
    private void refillLastPositionBlock() throws IOException {
@@ -1035,14 +937,18 @@ public final class Lucene101PostingsReader extends PostingsReaderBase {
         if ((code & 1) != 0) {
           payloadLength = posIn.readVInt();
         }
-        payloadLengthBuffer[i] = payloadLength;
-        posDeltaBuffer[i] = code >>> 1;
-        if (payloadLength != 0) {
-          if (payloadByteUpto + payloadLength > payloadBytes.length) {
-            payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength);
+        if (payloadLengthBuffer != null) { // needs payloads
+          payloadLengthBuffer[i] = payloadLength;
+          posDeltaBuffer[i] = code >>> 1;
+          if (payloadLength != 0) {
+            if (payloadByteUpto + payloadLength > payloadBytes.length) {
+              payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength);
+            }
+            posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength);
+            payloadByteUpto += payloadLength;
          }
-          posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength);
-          payloadByteUpto += payloadLength;
+        } else {
+          posIn.skipBytes(payloadLength);
         }
       } else {
         posDeltaBuffer[i] = code;
@@ -1053,48 +959,70 @@
         if ((deltaCode & 1) != 0) {
           offsetLength = posIn.readVInt();
         }
-        offsetStartDeltaBuffer[i] = deltaCode >>> 1;
-        offsetLengthBuffer[i] = offsetLength;
+        if (offsetStartDeltaBuffer != null) { // needs offsets
+          offsetStartDeltaBuffer[i] = deltaCode >>> 1;
+          offsetLengthBuffer[i] = offsetLength;
+        }
        }
      }
      payloadByteUpto = 0;
    }
 
+    private void refillOffsetsOrPayloads() throws IOException {
+      if (indexHasPayloads) {
+        if (needsPayloads) {
+          pforUtil.decode(payInUtil, payloadLengthBuffer);
+          int numBytes = payIn.readVInt();
+
+          if (numBytes > payloadBytes.length) {
+            payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes);
+          }
+          payIn.readBytes(payloadBytes, 0, numBytes);
+        } else if (payIn != null) { // needs offsets
+          // this works, because when writing a vint block we always force the first length to be
+          // written
+          PForUtil.skip(payIn); // skip over lengths
+          int numBytes = payIn.readVInt(); // read length of payloadBytes
+          payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
+        }
+        payloadByteUpto = 0;
+      }
+
+      if (indexHasOffsets) {
+        if (needsOffsets) {
+          pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
+          pforUtil.decode(payInUtil, offsetLengthBuffer);
+        } else if (payIn != null) { // needs payloads
+          // this works, because when writing a vint block we always force the first length to be
+          // written
+          PForUtil.skip(payIn); // skip over starts
+          PForUtil.skip(payIn); // skip over lengths
+        }
+      }
+    }
+
    private void refillPositions() throws IOException {
      if (posIn.getFilePointer() == lastPosBlockFP) {
        refillLastPositionBlock();
-      } else {
-        pforUtil.decode(posInUtil, posDeltaBuffer);
+        return;
+      }
+      pforUtil.decode(posInUtil, posDeltaBuffer);
 
-        if (indexHasPayloads) {
-          if (needsPayloads) {
-            pforUtil.decode(payInUtil, payloadLengthBuffer);
-            int numBytes = payIn.readVInt();
+      if (indexHasOffsetsOrPayloads) {
+        refillOffsetsOrPayloads();
+      }
+    }
 
-            if (numBytes > payloadBytes.length) {
-              payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes);
-            }
-            payIn.readBytes(payloadBytes, 0, numBytes);
-          } else {
-            // this works, because when writing a vint block we always force the first length to be
-            // written
-            PForUtil.skip(payIn); // skip over lengths
-            payIn.skipBytes(payIn.readVInt()); // skip over payloadBytes
-          }
-          payloadByteUpto = 0;
-        }
+    private void accumulatePendingPositions() throws IOException {
+      int freq = freq(); // trigger lazy decoding of freqs
+      posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
+      posDocBufferUpto = docBufferUpto;
 
-        if (indexHasOffsets) {
-          if (needsOffsets) {
-            pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
-            pforUtil.decode(payInUtil, offsetLengthBuffer);
-          } else {
-            // this works, because when writing a vint block we always force the first length to be
-            // written
-            PForUtil.skip(payIn); // skip over starts
-            PForUtil.skip(payIn); // skip over lengths
-          }
-        }
+      assert posPendingCount > 0;
+
+      if (posPendingCount > freq) {
+        skipPositions(freq);
+        posPendingCount = freq;
      }
    }
@@ -1116,20 +1044,14 @@
 
    @Override
    public int nextPosition() throws IOException {
+      if (needsPos == false) {
+        return -1;
+      }
+
+      assert posDocBufferUpto <= docBufferUpto;
      if (posDocBufferUpto != docBufferUpto) {
-        int freq = freq(); // triggers lazy decoding of freqs
-
-        // First position that is being read on this doc.
-        posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
-        posDocBufferUpto = docBufferUpto;
-
-        assert posPendingCount > 0;
-
-        if (posPendingCount > freq) {
-          skipPositions(freq);
-          posPendingCount = freq;
-        }
-
+        // First position we're reading on this doc
+        accumulatePendingPositions();
        position = 0;
        lastStartOffset = 0;
      }
@@ -1140,7 +1062,7 @@
      }
      position += posDeltaBuffer[posBufferUpto];
 
-      if (needsPayloadsOrOffsets) {
+      if (needsOffsetsOrPayloads) {
        accumulatePayloadAndOffsets();
      }
 
@@ -1178,96 +1100,6 @@
    public long cost() {
      return docFreq;
    }
-  }
-
-  private abstract class BlockImpactsEnum extends ImpactsEnum {
-
-    protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
-    protected final PForUtil pforUtil = new PForUtil();
-
-    protected final int[] docBuffer = new int[BLOCK_SIZE + 1];
-    protected final int[] freqBuffer = new int[BLOCK_SIZE];
-
-    protected final int docFreq; // number of docs in this posting list
-    // sum of freqBuffer in this posting list (or docFreq when omitted)
-    protected final long totalTermFreq;
-    protected final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
-
-    protected final IndexInput docIn;
-    protected final PostingDecodingUtil docInUtil;
-
-    protected int docCountUpto; // number of docs in or before the current block
-    protected int doc = -1; // doc we last read
-    protected int prevDocID = -1; // last doc ID of the previous block
-    protected int docBufferSize = BLOCK_SIZE;
-    protected int docBufferUpto = BLOCK_SIZE;
-
-    // true if we shallow-advanced to a new block that we have not decoded yet
-    protected boolean needsRefilling;
-
-    // level 0 skip data
-    protected int level0LastDocID = -1;
-    protected long level0DocEndFP;
-    protected final BytesRef level0SerializedImpacts;
-    protected final MutableImpactList level0Impacts;
-    // level 1 skip data
-    protected int level1LastDocID;
-    protected long level1DocEndFP;
-    protected int level1DocCountUpto = 0;
-    protected final BytesRef level1SerializedImpacts;
-    protected final MutableImpactList level1Impacts;
-
-    private BlockImpactsEnum(IntBlockTermState termState) throws IOException {
-      this.docFreq = termState.docFreq;
-      this.docIn = Lucene101PostingsReader.this.docIn.clone();
-      this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
-      if (docFreq > 1) {
-        prefetchPostings(docIn, termState);
-      }
-      this.singletonDocID = termState.singletonDocID;
-      this.totalTermFreq = termState.totalTermFreq;
-      level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
-      level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
-      level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0);
-      level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1);
-      if (docFreq < LEVEL1_NUM_DOCS) {
-        level1LastDocID = NO_MORE_DOCS;
-        if (docFreq > 1) {
-          docIn.seek(termState.docStartFP);
-        }
-      } else {
-        level1LastDocID = -1;
-        level1DocEndFP = termState.docStartFP;
-      }
-      // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in
-      // advance()
-      docBuffer[BLOCK_SIZE] = NO_MORE_DOCS;
-    }
-
-    @Override
-    public int docID() {
-      return doc;
-    }
-
-    @Override
-    public int startOffset() {
-      return -1;
-    }
-
-    @Override
-    public int endOffset() {
-      return -1;
-    }
-
-    @Override
-    public BytesRef getPayload() {
-      return null;
-    }
-
-    @Override
-    public long cost() {
-      return docFreq;
-    }
 
    private final Impacts impacts =
        new Impacts() {
@@ -1276,11 +1108,14 @@
 
          @Override
          public int numLevels() {
-            return level1LastDocID == NO_MORE_DOCS ? 1 : 2;
+            return indexHasFreq == false || level1LastDocID == NO_MORE_DOCS ? 1 : 2;
          }
 
          @Override
          public int getDocIdUpTo(int level) {
+            if (indexHasFreq == false) {
+              return NO_MORE_DOCS;
+            }
            if (level == 0) {
              return level0LastDocID;
            }
@@ -1289,11 +1124,13 @@
 
          @Override
          public List<Impact> getImpacts(int level) {
-            if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
-              return readImpacts(level0SerializedImpacts, level0Impacts);
-            }
-            if (level == 1) {
-              return readImpacts(level1SerializedImpacts, level1Impacts);
+            if (indexHasFreq) {
+              if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
+                return readImpacts(level0SerializedImpacts, level0Impacts);
+              }
+              if (level == 1) {
+                return readImpacts(level1SerializedImpacts, level1Impacts);
+              }
            }
            return DUMMY_IMPACTS;
          }
@@ -1308,527 +1145,11 @@
 
    @Override
    public Impacts getImpacts() {
+      assert needsImpacts;
      return impacts;
    }
  }
 
-  final class BlockImpactsDocsEnum extends BlockImpactsEnum {
-    final boolean indexHasPos;
-
-    private long freqFP;
-
-    public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState)
-        throws IOException {
-      super(termState);
-      this.indexHasPos = indexHasPos;
-      freqFP = -1;
-    }
-
-    @Override
-    public int freq() throws IOException {
-      if (freqFP != -1) {
-        docIn.seek(freqFP);
-        pforUtil.decode(docInUtil, freqBuffer);
-        freqFP = -1;
-      }
-      return freqBuffer[docBufferUpto - 1];
-    }
-
-    @Override
-    public int nextPosition() {
-      return -1;
-    }
-
-    private void refillDocs() throws IOException {
-      final int left = docFreq - docCountUpto;
-      assert left >= 0;
-
-      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-        freqFP = docIn.getFilePointer();
-        PForUtil.skip(docIn);
-        docCountUpto += BLOCK_SIZE;
-      } else if (docFreq == 1) {
-        docBuffer[0] = singletonDocID;
-        freqBuffer[0] = (int) totalTermFreq;
-        freqFP = -1;
-        docBuffer[1] = NO_MORE_DOCS;
-        docCountUpto++;
-        docBufferSize = 1;
-      } else {
-        // Read vInts:
-        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
-        prefixSum(docBuffer, left, prevDocID);
-        docBuffer[left] = NO_MORE_DOCS;
-        freqFP = -1;
-        docCountUpto += left;
-        docBufferSize = left;
-      }
-      prevDocID = docBuffer[BLOCK_SIZE - 1];
-      docBufferUpto = 0;
-      assert docBuffer[docBufferSize] == NO_MORE_DOCS;
-    }
-
-    private void skipLevel1To(int target) throws IOException {
-      while (true) {
-        prevDocID = level1LastDocID;
-        level0LastDocID = level1LastDocID;
-        docIn.seek(level1DocEndFP);
-        docCountUpto = level1DocCountUpto;
-        level1DocCountUpto += LEVEL1_NUM_DOCS;
-
-        if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) {
-          level1LastDocID = NO_MORE_DOCS;
-          break;
-        }
-
-        level1LastDocID += docIn.readVInt();
-        level1DocEndFP = docIn.readVLong() + docIn.getFilePointer();
-
-        if (level1LastDocID >= target) {
-          long skip1EndFP = docIn.readShort() + docIn.getFilePointer();
-          int numImpactBytes = docIn.readShort();
-          docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes);
-          level1SerializedImpacts.length = numImpactBytes;
-          assert indexHasPos || docIn.getFilePointer() == skip1EndFP;
-          docIn.seek(skip1EndFP);
-          break;
-        }
-      }
-    }
-
-    private void skipLevel0To(int target) throws IOException {
-      while (true) {
-        prevDocID = level0LastDocID;
-        if (docFreq - docCountUpto >= BLOCK_SIZE) {
-          long skip0NumBytes = docIn.readVLong();
-          // end offset of skip data (before the actual data starts)
-          long skip0End = docIn.getFilePointer() + skip0NumBytes;
-          int docDelta = readVInt15(docIn);
-          long blockLength = readVLong15(docIn);
-
-          level0LastDocID += docDelta;
-
-          if (target <= level0LastDocID) {
-            level0DocEndFP = docIn.getFilePointer() + blockLength;
-            int numImpactBytes = docIn.readVInt();
-            docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes);
-            level0SerializedImpacts.length = numImpactBytes;
-            docIn.seek(skip0End);
-            break;
-          }
-
-          // skip block
-          docIn.skipBytes(blockLength);
-          docCountUpto += BLOCK_SIZE;
-        } else {
-          level0LastDocID = NO_MORE_DOCS;
-          break;
-        }
-      }
-    }
-
-    @Override
-    public void advanceShallow(int target) throws IOException {
-      if (target > level0LastDocID) { // advance skip data on level 0
-        if (target > level1LastDocID) { // advance skip data on level 1
-          skipLevel1To(target);
-        } else if (needsRefilling) {
-          docIn.seek(level0DocEndFP);
-          docCountUpto += BLOCK_SIZE;
-        }
-
-        skipLevel0To(target);
-
-        needsRefilling = true;
-      }
-    }
-
-    private void moveToNextLevel0Block() throws IOException {
-      if (doc == level1LastDocID) {
-        skipLevel1To(doc + 1);
-      } else if (needsRefilling) {
-        docIn.seek(level0DocEndFP);
-        docCountUpto += BLOCK_SIZE;
-      }
-
-      prevDocID = level0LastDocID;
-      if (docFreq - docCountUpto >= BLOCK_SIZE) {
-        final long skip0Len = docIn.readVLong(); // skip len
-        final long skip0End = docIn.getFilePointer() + skip0Len;
-        final int docDelta = readVInt15(docIn);
-        final long blockLength = readVLong15(docIn);
-        level0LastDocID += docDelta;
-        level0DocEndFP = docIn.getFilePointer() + blockLength;
-        final int numImpactBytes = docIn.readVInt();
-        docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes);
-        level0SerializedImpacts.length = numImpactBytes;
-        docIn.seek(skip0End);
-      } else {
-        level0LastDocID = NO_MORE_DOCS;
-      }
-
-      refillDocs();
-      needsRefilling = false;
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      if (docBufferUpto == BLOCK_SIZE) {
-        if (needsRefilling) {
-          refillDocs();
-          needsRefilling = false;
-        } else {
-          moveToNextLevel0Block();
-        }
-      }
-
-      return this.doc = docBuffer[docBufferUpto++];
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      if (target > level0LastDocID || needsRefilling) {
-        advanceShallow(target);
-        refillDocs();
-        needsRefilling = false;
-      }
-
-      int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
-      this.doc = docBuffer[next];
-      docBufferUpto = next + 1;
-      return doc;
-    }
-  }
-
-  final class BlockImpactsPostingsEnum extends BlockImpactsEnum {
-    private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
-
-    private int posBufferUpto;
-    final IndexInput posIn;
-    final PostingDecodingUtil posInUtil;
-
-    final boolean indexHasFreq;
-    final boolean indexHasOffsets;
-    final boolean indexHasPayloads;
-    final boolean indexHasOffsetsOrPayloads;
-
-    private long freqFP; // offset of the freq block
-
-    private int position; // current position
-
-    // value of docBufferUpto on the last doc ID when positions have been read
-    private int posDocBufferUpto;
-
-    // how many positions "behind" we are; nextPosition must
-    // skip these to "catch up":
-    private int posPendingCount;
-
-    // File pointer where the last (vInt encoded) pos delta
-    // block is. We need this to know whether to bulk
-    // decode vs vInt decode the block:
-    private final long lastPosBlockFP;
-
-    // level 0 skip data
-    private long level0PosEndFP;
-    private int level0BlockPosUpto;
-    // level 1 skip data
-    private long level1PosEndFP;
-    private int level1BlockPosUpto;
-
-    public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
-        throws IOException {
-      super(termState);
-      final IndexOptions options = fieldInfo.getIndexOptions();
-      indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
-      indexHasOffsets =
-          options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-      indexHasPayloads = fieldInfo.hasPayloads();
-      indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
-
-      this.posIn = Lucene101PostingsReader.this.posIn.clone();
-      posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
-
-      // Where this term's postings start in the .pos file:
-      final long posTermStartFP = termState.posStartFP;
-      posIn.seek(posTermStartFP);
-      level1PosEndFP = posTermStartFP;
-      level0PosEndFP = posTermStartFP;
-      posPendingCount = 0;
-      if (termState.totalTermFreq < BLOCK_SIZE) {
-        lastPosBlockFP = posTermStartFP;
-      } else if (termState.totalTermFreq == BLOCK_SIZE) {
-        lastPosBlockFP = -1;
-      } else {
-        lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
-      }
-      level1BlockPosUpto = 0;
-      posBufferUpto = BLOCK_SIZE;
-    }
-
-    @Override
-    public int freq() throws IOException {
-      if (freqFP != -1) {
-        docIn.seek(freqFP);
-        pforUtil.decode(docInUtil, freqBuffer);
-        freqFP = -1;
-      }
-      return freqBuffer[docBufferUpto - 1];
-    }
-
-    private void refillDocs() throws IOException {
-      final int left = docFreq - docCountUpto;
-      assert left >= 0;
-
-      if (left >= BLOCK_SIZE) {
-        forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-        freqFP = docIn.getFilePointer();
-        PForUtil.skip(docIn);
-        docCountUpto += BLOCK_SIZE;
-      } else if (docFreq == 1) {
-        docBuffer[0] = singletonDocID;
-        freqBuffer[0] = (int) totalTermFreq;
-        freqFP = -1;
-        docBuffer[1] = NO_MORE_DOCS;
-        docCountUpto++;
-        docBufferSize = 1;
-
-      } else {
-        // Read vInts:
-        PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
-        prefixSum(docBuffer, left, prevDocID);
-        docBuffer[left] = NO_MORE_DOCS;
-        freqFP = -1;
-        docCountUpto += left;
-        docBufferSize = left;
-        freqFP = -1;
-      }
-      prevDocID = docBuffer[BLOCK_SIZE - 1];
-      docBufferUpto = 0;
-      posDocBufferUpto = 0;
-      assert docBuffer[docBufferSize] == NO_MORE_DOCS;
-    }
-
-    private void skipLevel1To(int target) throws IOException {
-      while (true) {
-        prevDocID = level1LastDocID;
-        level0LastDocID = level1LastDocID;
-        docIn.seek(level1DocEndFP);
-        level0PosEndFP = level1PosEndFP;
-        level0BlockPosUpto = level1BlockPosUpto;
-        docCountUpto = level1DocCountUpto;
-        level1DocCountUpto += LEVEL1_NUM_DOCS;
-
-        if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) {
-          level1LastDocID = NO_MORE_DOCS;
-          break;
-        }
-
-        level1LastDocID += docIn.readVInt();
-        level1DocEndFP = docIn.readVLong() + docIn.getFilePointer();
-
-        long skip1EndFP = docIn.readShort() + docIn.getFilePointer();
-        int numImpactBytes = docIn.readShort();
-        if (level1LastDocID >= target) {
-          docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes);
-          level1SerializedImpacts.length = numImpactBytes;
-        } else {
-          docIn.skipBytes(numImpactBytes);
-        }
-        level1PosEndFP += docIn.readVLong();
-        level1BlockPosUpto = docIn.readByte();
-        assert indexHasOffsetsOrPayloads || docIn.getFilePointer() == skip1EndFP;
-
-        if (level1LastDocID >= target) {
-          docIn.seek(skip1EndFP);
-          break;
-        }
-      }
-    }
-
-    private void skipLevel0To(int target) throws IOException {
-      long posFP;
-      int posUpto;
-
-      while (true) {
-        prevDocID = level0LastDocID;
-
-        posFP = level0PosEndFP;
-        posUpto = level0BlockPosUpto;
-
-        if (docFreq - docCountUpto >= BLOCK_SIZE) {
-          docIn.readVLong(); // skip0 num bytes
-          int docDelta = readVInt15(docIn);
-          long blockLength = readVLong15(docIn);
-          level0DocEndFP = docIn.getFilePointer() + blockLength;
-
-          level0LastDocID += docDelta;
-
-          if (target <= level0LastDocID) {
-            int numImpactBytes = docIn.readVInt();
-            docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes);
-            level0SerializedImpacts.length = numImpactBytes;
-            level0PosEndFP += docIn.readVLong();
-            level0BlockPosUpto = docIn.readByte();
-            if (indexHasOffsetsOrPayloads) {
-              docIn.readVLong(); // pay fp delta
-              docIn.readVInt(); // pay upto
-            }
-            break;
-          }
-          // skip block
-          docIn.skipBytes(docIn.readVLong()); // impacts
-          level0PosEndFP += docIn.readVLong();
-          level0BlockPosUpto = docIn.readVInt();
-          docIn.seek(level0DocEndFP);
-          docCountUpto += BLOCK_SIZE;
-        } else {
-          level0LastDocID = NO_MORE_DOCS;
-          break;
-        }
-      }
-
-      // If nextBlockPosFP is less than the current FP, it means that the block of positions for
-      // the first docs of the next block are already decoded. In this case we just accumulate
-      // frequencies into posPendingCount instead of seeking backwards and decoding the same pos
-      // block again.
-      if (posFP >= posIn.getFilePointer()) {
-        posIn.seek(posFP);
-        posPendingCount = posUpto;
-        posBufferUpto = BLOCK_SIZE;
-      } else {
-        posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE);
-      }
-    }
-
-    @Override
-    public void advanceShallow(int target) throws IOException {
-      if (target > level0LastDocID) { // advance level 0 skip data
-
-        if (target > level1LastDocID) { // advance skip data on level 1
-          skipLevel1To(target);
-        } else if (needsRefilling) {
-          docIn.seek(level0DocEndFP);
-          docCountUpto += BLOCK_SIZE;
-        }
-
-        skipLevel0To(target);
-
-        needsRefilling = true;
-      }
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      if (docBufferUpto == BLOCK_SIZE) {
-        advanceShallow(doc + 1);
-        assert needsRefilling;
-        refillDocs();
-        needsRefilling = false;
-      }
-
-      doc = docBuffer[docBufferUpto];
-      docBufferUpto++;
-      return this.doc;
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      if (target > level0LastDocID || needsRefilling) {
-        advanceShallow(target);
-        assert needsRefilling;
-        refillDocs();
-        needsRefilling = false;
-      }
-
-      int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
-      docBufferUpto = next + 1;
-      return this.doc = docBuffer[next];
-    }
-
-    private void skipPositions(int freq) throws IOException {
-      // Skip positions now:
-      int toSkip = posPendingCount - freq;
-      // if (DEBUG) {
-      //   System.out.println("        FPR.skipPositions: toSkip=" + toSkip);
-      // }
-
-      final int leftInBlock = BLOCK_SIZE - posBufferUpto;
-      if (toSkip < leftInBlock) {
-        posBufferUpto += toSkip;
-      } else {
-        toSkip -= leftInBlock;
-        while (toSkip >= BLOCK_SIZE) {
-          assert posIn.getFilePointer() != lastPosBlockFP;
-          PForUtil.skip(posIn);
-          toSkip -= BLOCK_SIZE;
-        }
-        refillPositions();
-        posBufferUpto = toSkip;
-      }
-    }
-
-    private void refillPositions() throws IOException {
-      if (posIn.getFilePointer() == lastPosBlockFP) {
-        final int count = (int) (totalTermFreq % BLOCK_SIZE);
-        int payloadLength = 0;
-        for (int i = 0; i < count; i++) {
-          int code = posIn.readVInt();
-          if (indexHasPayloads) {
-            if ((code & 1) != 0) {
-              payloadLength = posIn.readVInt();
-            }
-            posDeltaBuffer[i] = code >>> 1;
-            if (payloadLength != 0) {
-              posIn.skipBytes(payloadLength);
-            }
-          } else {
-            posDeltaBuffer[i] = code;
-          }
-
-          if (indexHasOffsets) {
-            int deltaCode = posIn.readVInt();
-            if ((deltaCode & 1) != 0) {
-              posIn.readVInt(); // offset length
-            }
-          }
-        }
-      } else {
-        pforUtil.decode(posInUtil, posDeltaBuffer);
-      }
-    }
-
-    @Override
-    public int nextPosition() throws IOException {
-      if (posDocBufferUpto != docBufferUpto) {
-        int freq = freq(); // triggers lazy decoding of freqs
-
-        // First position that is being read on this doc.
-        posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
-        posDocBufferUpto = docBufferUpto;
-
-        assert posPendingCount > 0;
-
-        if (posPendingCount > freq) {
-          skipPositions(freq);
-          posPendingCount = freq;
-        }
-
-        position = 0;
-      }
-
-      if (posBufferUpto == BLOCK_SIZE) {
-        refillPositions();
-        posBufferUpto = 0;
-      }
-      position += posDeltaBuffer[posBufferUpto];
-
-      posBufferUpto++;
-      posPendingCount--;
-      return position;
-    }
-  }
-
   /**
    * @see Lucene101PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int)
    */
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 1c5436dcf0a..b1d283cc389 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -399,7 +399,7 @@ public class PhraseQuery extends Query {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
+   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
    * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -410,9 +410,8 @@ public class PhraseQuery extends Query {
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
 
   /**
-   * Number of simple operations in {@link
-   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
-   * is done.
+   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
+   * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
 
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
index c0f2f61d7cc..c1360a9d9a7 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
@@ -261,7 +261,7 @@ class TermIntervalsSource extends IntervalsSource {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene101PostingsReader.EverythingEnum#nextPosition()}.
+   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
    * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -272,7 +272,7 @@
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
 
   /**
-   * Number of simple operations in {@link Lucene101PostingsReader.EverythingEnum#nextPosition()}
+   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
    * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
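
A brief aside on the motivation stated in the commit message: HotSpot can inline a virtual call site only while it has observed at most two receiver types (monomorphic or bimorphic); once a third implementation reaches the site it becomes megamorphic and falls back to dynamic dispatch. Below is a minimal, self-contained sketch of that effect. The interface and classes (PostingsLike, DocsOnly, DocsAndPositions) are hypothetical stand-ins for illustration, not Lucene code:

    // Hypothetical sketch: two implementations keep the call site bimorphic;
    // a third implementation flowing through it would make it megamorphic.
    interface PostingsLike {
      int nextDoc();
    }

    final class DocsOnly implements PostingsLike {
      private int doc;

      public int nextDoc() {
        return doc += 1; // pretend to advance by one doc
      }
    }

    final class DocsAndPositions implements PostingsLike {
      private int doc;

      public int nextDoc() {
        return doc += 2; // pretend to advance with extra per-doc work
      }
    }

    public class BimorphicCallSiteSketch {
      // p.nextDoc() below is a single call site. If at most two concrete types
      // ever reach it, HotSpot can inline both targets behind a cheap type
      // check; merging postings enum implementations, as this patch does,
      // reduces the number of types that reach such call sites.
      static long drain(PostingsLike p, int iterations) {
        long sum = 0;
        for (int i = 0; i < iterations; i++) {
          sum += p.nextDoc();
        }
        return sum;
      }

      public static void main(String[] args) {
        // Only two receiver types: the drain() call site stays bimorphic.
        System.out.println(drain(new DocsOnly(), 1_000) + drain(new DocsAndPositions(), 1_000));
      }
    }

This is why the patch funnels postings(), impacts(), and the former BlockDocsEnum, EverythingEnum, BlockImpactsDocsEnum, and BlockImpactsPostingsEnum into the single BlockPostingsEnum: call sites such as those in DefaultBulkScorer and ConjunctionDISI then see fewer distinct PostingsEnum classes at runtime.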