From 0be8bfb4fcec0ebd5da5e7f996fe4b9fefedc6ee Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 9 Aug 2012 09:16:09 +0000 Subject: [PATCH] LUCENE-3892: backporting r1371010 and r1371011 to BlockPacked. I changed the comment on the value of BLOCK_SIZE. Indeed, since PackedInts encoding/decoding is long-aligned, all numbers of bits per value that are co-prime with 64 cannot encode/decode less than 64 values per iteration, so the block size should be at least 64. Switching it to a lower value (eg. 32) should work but will perform many unnecessary operations in the encoding/decoding step. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1371114 13f79535-47bb-0310-9956-ffa450edef68 --- .../BlockPackedPostingsFormat.java | 4 + .../BlockPackedPostingsReader.java | 115 ++++++++++-------- .../blockpacked/BlockPackedSkipWriter.java | 2 +- 3 files changed, 72 insertions(+), 49 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java index 3ad292ffad2..da0d017881f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java @@ -41,6 +41,10 @@ public final class BlockPackedPostingsFormat extends PostingsFormat { private final int minTermBlockSize; private final int maxTermBlockSize; + + // nocommit is this right?: + // NOTE: should be at least 64 because of PackedInts long-aligned encoding/decoding + // NOTE: must be factor of ... 64? 
public final static int BLOCK_SIZE = 128; public BlockPackedPostingsFormat() { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java index 1f5ddb4d404..72fc8aaa183 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java @@ -384,7 +384,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } private void refillDocs() throws IOException { - //System.out.println("["+docFreq+"]"+" refillDoc"); final int left = docFreq - docUpto; assert left > 0; @@ -426,7 +425,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } return doc = NO_MORE_DOCS; } - //System.out.println("["+docFreq+"]"+" nextDoc"); if (docBufferUpto == BLOCK_SIZE) { refillDocs(); } @@ -486,15 +484,15 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { skipped = true; } - final int newDocUpto = skipper.skipTo(target); + final int newDocUpto = skipper.skipTo(target) + 1; if (newDocUpto > docUpto) { // Skipper moved if (DEBUG) { System.out.println("skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer()); } - assert newDocUpto % BLOCK_SIZE == (BLOCK_SIZE - 1): "got " + newDocUpto; - docUpto = newDocUpto+1; + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; // Force to read next block docBufferUpto = BLOCK_SIZE; @@ -503,6 +501,12 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } nextSkipDoc = skipper.getNextSkipDoc(); } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } // Now scan... 
this is an inlined/pared down version // of nextDoc(): @@ -510,18 +514,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" scan doc=" + accum + " docBufferUpto=" + docBufferUpto); } - if (docUpto == docFreq) { - return doc = NO_MORE_DOCS; - } - - // nocommit: in theory we should not hit this? ie - // skipper should already have moved us to the block - // containing the doc? yet assert false trips ... i - // think because if you advance w/o having done a - // nextDoc yet()... can we assert/remove this? - if (docBufferUpto == BLOCK_SIZE) { - refillDocs(); - } accum += docDeltaBuffer[docBufferUpto]; docUpto++; @@ -529,6 +521,9 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { break; } docBufferUpto++; + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } } if (liveDocs == null || liveDocs.get(accum)) { @@ -666,9 +661,9 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } private void refillDocs() throws IOException { - //System.out.println("["+docFreq+"]"+" refillDoc"); final int left = docFreq - docUpto; assert left > 0; + if (left >= BLOCK_SIZE) { if (DEBUG) { System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); @@ -797,7 +792,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { skipped = true; } - final int newDocUpto = skipper.skipTo(target); + final int newDocUpto = skipper.skipTo(target) + 1; if (newDocUpto > docUpto) { // Skipper moved @@ -805,8 +800,8 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto()); } - assert newDocUpto % BLOCK_SIZE == (BLOCK_SIZE - 1): "got " + newDocUpto; - docUpto = newDocUpto+1; + assert newDocUpto % 
BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; // Force to read next block docBufferUpto = BLOCK_SIZE; @@ -817,6 +812,12 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } nextSkipDoc = skipper.getNextSkipDoc(); } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } // Now scan... this is an inlined/pared down version // of nextDoc(): @@ -827,16 +828,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (docUpto == docFreq) { return doc = NO_MORE_DOCS; } - // nocommit: in theory we should not hit this? ie - // skipper should already have moved us to the block - // containing the doc? yet assert false trips ... i - // think because if you advance w/o having done a - // nextDoc yet()... can we assert/remove this? - if (docBufferUpto == BLOCK_SIZE) { - // nocommit hmm skip freq? but: we don't ever - // scan over more than one block? - refillDocs(); - } accum += docDeltaBuffer[docBufferUpto]; freq = freqBuffer[docBufferUpto]; posPendingCount += freq; @@ -846,6 +837,9 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (accum >= target) { break; } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } } if (liveDocs == null || liveDocs.get(accum)) { @@ -1110,7 +1104,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } private void refillDocs() throws IOException { - //System.out.println("["+docFreq+"]"+" refillDoc"); final int left = docFreq - docUpto; assert left > 0; @@ -1226,7 +1219,6 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (docUpto == docFreq) { return doc = NO_MORE_DOCS; } - //System.out.println("["+docFreq+"]"+" nextDoc"); if (docBufferUpto == BLOCK_SIZE) { refillDocs(); } @@ -1293,15 +1285,15 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { skipped = true; } - final int newDocUpto = 
skipper.skipTo(target); + final int newDocUpto = skipper.skipTo(target) + 1; if (newDocUpto > docUpto) { // Skipper moved if (DEBUG) { System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto() + " pay.fp=" + skipper.getPayPointer() + " lastStartOffset=" + lastStartOffset); } - assert newDocUpto % BLOCK_SIZE == (BLOCK_SIZE - 1): "got " + newDocUpto; - docUpto = newDocUpto+1; + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; // Force to read next block docBufferUpto = BLOCK_SIZE; @@ -1315,24 +1307,51 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { } nextSkipDoc = skipper.getNextSkipDoc(); } - - // nocommit inline nextDoc here + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } // Now scan: - while (nextDoc() != NO_MORE_DOCS) { - if (doc >= target) { - if (DEBUG) { - System.out.println(" advance return doc=" + doc); - } - return doc; + // Now scan: + while (true) { + if (DEBUG) { + System.out.println(" scan doc=" + accum + " docBufferUpto=" + docBufferUpto); + } + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; } } - if (DEBUG) { - System.out.println(" advance return doc=END"); + if (liveDocs == null || liveDocs.get(accum)) { + if (DEBUG) { + System.out.println(" return doc=" + accum); + } + if (indexHasPayloads) { + payloadByteUpto += payloadLength; + payloadLength = 0; + } + position = 0; + payloadLength = 0; + lastStartOffset = 0; + return doc = accum; + } else { + if (DEBUG) { + System.out.println(" now do nextDoc()"); + } + return nextDoc(); } - - return NO_MORE_DOCS; } // 
nocommit in theory we could avoid loading frq block diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedSkipWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedSkipWriter.java index 599a3f5e45c..387dd6da9ff 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedSkipWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedSkipWriter.java @@ -37,7 +37,7 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter; * block, only record skip data at the start its start point(if it exist). * * For each skip point, we will record: - * 1. lastDocID, + * 1. the docID at the previous position, i.e. for skip point 12, record docID[11], etc. * 2. its related file points(position, payload), * 3. related numbers or uptos(position, payload). * 4. start offset.