diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java index 482a3e20712..fb776f7f9aa 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java @@ -24,7 +24,6 @@ import java.nio.IntBuffer; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.pfor.ForUtil; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; @@ -43,8 +42,6 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; -// nocommit move ForUtil here? - // nocommit javadocs public final class BlockPostingsReader extends PostingsReaderBase { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java index 68e4c402d43..a4b530dfd59 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java @@ -26,7 +26,6 @@ import java.util.List; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.TermStats; -import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here? import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/block/ForUtil.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/block/ForUtil.java index b660c0b38e6..374c0aaa452 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/ForUtil.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.pfor; +package org.apache.lucene.codecs.block; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PackedIntsDecompress.java b/lucene/core/src/java/org/apache/lucene/codecs/block/PackedIntsDecompress.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/pfor/PackedIntsDecompress.java rename to lucene/core/src/java/org/apache/lucene/codecs/block/PackedIntsDecompress.java index 6f41d65da42..79600403206 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PackedIntsDecompress.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/PackedIntsDecompress.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.pfor; +package org.apache.lucene.codecs.block; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/gendecompress.py b/lucene/core/src/java/org/apache/lucene/codecs/block/gendecompress.py similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/pfor/gendecompress.py rename to lucene/core/src/java/org/apache/lucene/codecs/block/gendecompress.py index 3ea9f6042ed..29b44672965 100755 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/gendecompress.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/gendecompress.py @@ -48,7 +48,7 @@ def genDecompress(): f = open(fileName, 'w') w = f.write try: - w("package org.apache.lucene.codecs.pfor;\n") + w("package org.apache.lucene.codecs.block;\n") w("""/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java similarity index 62% rename from lucene/core/src/java/org/apache/lucene/codecs/pfor/ForPostingsFormat.java rename to lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java index b1fb043bfbe..211d55117e1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.pfor; +package org.apache.lucene.codecs.blockpacked; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -18,49 +18,41 @@ package org.apache.lucene.codecs.pfor; */ import java.io.IOException; -import java.util.Set; import org.apache.lucene.codecs.BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTreeTermsWriter; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.FixedGapTermsIndexReader; -import org.apache.lucene.codecs.FixedGapTermsIndexWriter; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.TermsIndexReaderBase; -import org.apache.lucene.codecs.TermsIndexWriterBase; -import org.apache.lucene.codecs.sep.SepPostingsReader; -import org.apache.lucene.codecs.sep.SepPostingsWriter; -import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; /** * Pass ForFactory to a PostingsWriter/ReaderBase, and get * customized postings format plugged. 
*/ -public final class ForPostingsFormat extends PostingsFormat { - private final int minBlockSize; - private final int maxBlockSize; +public final class BlockPackedPostingsFormat extends PostingsFormat { + public static final String DOC_EXTENSION = "doc"; + public static final String POS_EXTENSION = "pos"; + public static final String PAY_EXTENSION = "pay"; + + private final int minTermBlockSize; + private final int maxTermBlockSize; public final static int DEFAULT_BLOCK_SIZE = 128; - public ForPostingsFormat() { - super("For"); - this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE; - this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE; + public BlockPackedPostingsFormat() { + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); } - public ForPostingsFormat(int minBlockSize, int maxBlockSize) { - super("For"); - this.minBlockSize = minBlockSize; - assert minBlockSize > 1; - this.maxBlockSize = maxBlockSize; - assert minBlockSize <= maxBlockSize; + public BlockPackedPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("BlockPacked"); + this.minTermBlockSize = minTermBlockSize; + assert minTermBlockSize > 1; + this.maxTermBlockSize = maxTermBlockSize; + assert minTermBlockSize <= maxTermBlockSize; } @Override @@ -71,13 +63,14 @@ public final class ForPostingsFormat extends PostingsFormat { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { // TODO: implement a new PostingsWriterBase to improve skip-settings - PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new ForFactory()); + PostingsWriterBase postingsWriter = new BlockPackedPostingsWriter(state, 128); + boolean success = false; try { FieldsConsumer ret = new BlockTreeTermsWriter(state, postingsWriter, - minBlockSize, - maxBlockSize); + minTermBlockSize, + maxTermBlockSize); success = true; return ret; } finally { @@ -89,13 +82,12 @@ public final class ForPostingsFormat extends PostingsFormat { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, - state.fieldInfos, - state.segmentInfo, - state.context, - new ForFactory(), - state.segmentSuffix); - + PostingsReaderBase postingsReader = new BlockPackedPostingsReader(state.dir, + state.fieldInfos, + state.segmentInfo, + state.context, + state.segmentSuffix, + 128); boolean success = false; try { FieldsProducer ret = new BlockTreeTermsReader(state.dir, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java new file mode 100644 index 00000000000..08312c4a1a1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java @@ -0,0 +1,1527 @@ +package org.apache.lucene.codecs.blockpacked; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.IntBuffer; +import java.nio.LongBuffer; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +// nocommit javadocs +public final class BlockPackedPostingsReader extends PostingsReaderBase { + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + public static boolean DEBUG = false; + + // nocommit + final String segment; + + // NOTE: not private to avoid access$NNN methods: + final int blockSize; + + public BlockPackedPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix, int blockSize) throws IOException { + boolean success = false; + segment = segmentInfo.name; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + try { + docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPackedPostingsFormat.DOC_EXTENSION), + ioContext); + CodecUtil.checkHeader(docIn, + BlockPackedPostingsWriter.DOC_CODEC, + BlockPackedPostingsWriter.VERSION_START, + BlockPackedPostingsWriter.VERSION_START); + + if (fieldInfos.hasProx()) { + posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION), + ioContext); + CodecUtil.checkHeader(posIn, + BlockPackedPostingsWriter.POS_CODEC, + BlockPackedPostingsWriter.VERSION_START, + BlockPackedPostingsWriter.VERSION_START); + + if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) { + payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPackedPostingsFormat.PAY_EXTENSION), + ioContext); + CodecUtil.checkHeader(payIn, + BlockPackedPostingsWriter.PAY_CODEC, + BlockPackedPostingsWriter.VERSION_START, + BlockPackedPostingsWriter.VERSION_START); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + + this.blockSize = blockSize; + } + + @Override + public void init(IndexInput termsIn) throws IOException { + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, + 
BlockPackedPostingsWriter.TERMS_CODEC, + BlockPackedPostingsWriter.VERSION_START, + BlockPackedPostingsWriter.VERSION_START); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != blockSize) { + throw new IllegalStateException("index-time blockSize (" + indexBlockSize + ") != read-time blockSize (" + blockSize + ")"); + } + } + + static void readBlock(IndexInput in, byte[] encoded, LongBuffer encodedBuffer, LongBuffer buffer) throws IOException { + int header = in.readVInt(); + in.readBytes(encoded, 0, ForUtil.getEncodedSize(header)); + ForUtil.decompress(buffer, encodedBuffer, header); + } + + static void skipBlock(IndexInput in) throws IOException { + int header = in.readVInt(); + in.seek(in.getFilePointer() + ForUtil.getEncodedSize(header)); + } + + // Must keep final because we do non-standard clone + private final static class IntBlockTermState extends BlockTermState { + long docStartFP; + long posStartFP; + long payStartFP; + int skipOffset; + int lastPosBlockOffset; + + // Only used by the "primary" TermState -- clones don't + // copy this (basically they are "transient"): + ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...? + byte[] bytes; + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + + // Do not copy bytes, bytesReader (else TermState is + // very heavy, ie drags around the entire block's + // byte[]). On seek back, if next() is in fact used + // (rare!), they will be re-read from disk. 
+ } + + @Override + public String toString() { + return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset; + } + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + /* Reads but does not decode the byte[] blob holding + metadata for the current terms block */ + @Override + public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + + final int numBytes = termsIn.readVInt(); + + if (termState.bytes == null) { + termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + termState.bytesReader = new ByteArrayDataInput(); + } else if (termState.bytes.length < numBytes) { + termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + + termsIn.readBytes(termState.bytes, 0, numBytes); + termState.bytesReader.reset(termState.bytes, 0, numBytes); + } + + @Override + public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) + throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + final boolean isFirstTerm = termState.termBlockOrd == 0; + final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + final boolean fieldHasPayloads = fieldInfo.hasPayloads(); + + final DataInput in = termState.bytesReader; + if (isFirstTerm) { + termState.docStartFP = in.readVLong(); + if (fieldHasPositions) { + termState.posStartFP = in.readVLong(); + if (termState.totalTermFreq > blockSize) { + termState.lastPosBlockOffset = in.readVInt(); + } else { + termState.lastPosBlockOffset = -1; + } + if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= blockSize) { + termState.payStartFP = in.readVLong(); + } else { + termState.payStartFP = -1; + } + } + } else { + termState.docStartFP += in.readVLong(); + if (fieldHasPositions) { + termState.posStartFP += in.readVLong(); + if (termState.totalTermFreq > blockSize) { + termState.lastPosBlockOffset = in.readVInt(); + } else { + termState.lastPosBlockOffset = -1; + } + if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= blockSize) { + long delta = in.readVLong(); + if (termState.payStartFP == -1) { + termState.payStartFP = delta; + } else { + termState.payStartFP += delta; + } + } + } + } + + if (termState.docFreq > blockSize) { + termState.skipOffset = in.readVInt(); + } else { + termState.skipOffset = -1; + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsEnum reuse, boolean needsFreqs) throws IOException { + BlockDocsEnum docsEnum; + if (reuse instanceof BlockDocsEnum) { + docsEnum = (BlockDocsEnum) reuse; + if (!docsEnum.canReuse(docIn, fieldInfo)) { + docsEnum = new BlockDocsEnum(fieldInfo); + } + } else { + docsEnum = new BlockDocsEnum(fieldInfo); + } + return docsEnum.reset(liveDocs, (IntBlockTermState) termState); + } + + // TODO: specialize to liveDocs vs not, and freqs vs not + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, + DocsAndPositionsEnum reuse, boolean needsOffsets) + 
throws IOException { + + // nocommit use needsPayloads here: + if (!needsOffsets && !fieldInfo.hasPayloads()) { + BlockDocsAndPositionsEnum docsAndPositionsEnum; + if (reuse instanceof BlockDocsAndPositionsEnum) { + docsAndPositionsEnum = (BlockDocsAndPositionsEnum) reuse; + if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + } else { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + return docsAndPositionsEnum.reset(liveDocs, (IntBlockTermState) termState); + } else { + EverythingEnum everythingEnum; + if (reuse instanceof EverythingEnum) { + everythingEnum = (EverythingEnum) reuse; + if (!everythingEnum.canReuse(docIn, fieldInfo)) { + everythingEnum = new EverythingEnum(fieldInfo); + } + } else { + everythingEnum = new EverythingEnum(fieldInfo); + } + return everythingEnum.reset(liveDocs, (IntBlockTermState) termState); + } + } + + final class BlockDocsEnum extends DocsEnum { + private final byte[] encoded; + private final LongBuffer encodedBuffer; + + private final long[] docDeltaBuffer = new long[blockSize]; + private final long[] freqBuffer = new long[blockSize]; + private final LongBuffer docDeltaLBuffer = LongBuffer.wrap(docDeltaBuffer); + private final LongBuffer freqLBuffer = LongBuffer.wrap(freqBuffer); + + private int docBufferUpto; + + private BlockPackedSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPackedPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + encoded = new byte[blockSize*4]; + encodedBuffer = ByteBuffer.wrap(encoded).asLongBuffer(); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasFreq == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) && + indexHasPos == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: seg=" + segment + " termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = 
blockSize; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); + } + readBlock(docIn, encoded, encodedBuffer, docDeltaLBuffer); + + if (indexHasFreq) { + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + readBlock(docIn, encoded, encodedBuffer, freqLBuffer); + } + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } else { + docDeltaBuffer[i] = code; + } + } + } + docBufferUpto = 0; + } + + @Override + public int nextDoc() throws IOException { + + if (DEBUG) { + System.out.println("\nFPR.nextDoc"); + } + + while (true) { + if (DEBUG) { + System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto); + } + + if (docUpto == docFreq) { + if (DEBUG) { + System.out.println(" return doc=END"); + } + return doc = NO_MORE_DOCS; + } + + if (docBufferUpto == blockSize) { + refillDocs(); + } + + if (DEBUG) { + System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]); + } + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + freq = (int)freqBuffer[docBufferUpto]; + docBufferUpto++; + if (DEBUG) { + System.out.println(" return doc=" + doc + " freq=" + freq); + } + return doc; + } + + if (DEBUG) { + System.out.println(" doc=" + accum + " is deleted; try next doc"); + } + + docBufferUpto++; + } + } + + @Override + public int advance(int target) throws IOException { + // nocommit make frq block load lazy/skippable + + // nocommit 2 is heuristic guess!! + // nocommit put cheating back! does it help? + // nocommit use skipper!!! it has next last doc id!! 
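The loop body of the vInt tail decode in refillDocs above is mangled in this text, so here is a minimal sketch of what the surviving fragments (the code >>> 1 shift, the (code & 1) freq test, and the else branch storing the raw code) imply, written as a free-standing helper; the name readVIntDocTail and its signature are illustrative, not the patch's code. When freqs are indexed, each doc delta is shifted left one bit and the low bit flags freq == 1, so the common single-occurrence case costs no extra byte; otherwise the freq follows as its own vInt.

    // Hedged sketch: decode the final partial block (left < blockSize postings) written as vInts.
    static void readVIntDocTail(IndexInput docIn, int left, boolean indexHasFreq,
                                long[] docDeltaBuffer, long[] freqBuffer) throws IOException {
      for (int i = 0; i < left; i++) {
        final int code = docIn.readVInt();
        if (indexHasFreq) {
          docDeltaBuffer[i] = code >>> 1;          // doc delta lives in the high bits
          if ((code & 1) != 0) {
            freqBuffer[i] = 1;                     // low bit set: freq == 1, not stored separately
          } else {
            freqBuffer[i] = docIn.readVInt();      // otherwise the freq follows as a vInt
          }
        } else {
          docDeltaBuffer[i] = code;                // freqs not indexed: the code is the raw delta
        }
      }
    }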
+ //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) { + if (docFreq > blockSize && target - accum > blockSize) { + + if (DEBUG) { + System.out.println("load skipper"); + } + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new BlockPackedSkipReader((IndexInput) docIn.clone(), + BlockPackedPostingsWriter.maxSkipLevels, + blockSize, + indexHasPos, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP+skipOffset, docTermStartFP, 0, 0, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target); + + if (newDocUpto > docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println("skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer()); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + } + + + final class BlockDocsAndPositionsEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + private final LongBuffer encodedBuffer; + + private final long[] docDeltaBuffer = new long[blockSize]; + private final long[] freqBuffer = new long[blockSize]; + private final long[] posDeltaBuffer = new long[blockSize]; + + + private final LongBuffer docDeltaLBuffer = LongBuffer.wrap(docDeltaBuffer); + private final LongBuffer freqLBuffer = LongBuffer.wrap(freqBuffer); + private final LongBuffer posDeltaLBuffer = LongBuffer.wrap(posDeltaBuffer); + + private int docBufferUpto; + private int posBufferUpto; + + private BlockPackedSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final IndexInput posIn; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPackedPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + this.posIn = (IndexInput) BlockPackedPostingsReader.this.posIn.clone(); + encoded = new byte[blockSize*4]; + encodedBuffer = ByteBuffer.wrap(encoded).asLongBuffer(); + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsAndPositionsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + posPendingFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < blockSize) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == blockSize) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = blockSize; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, encoded, encodedBuffer, docDeltaLBuffer); + + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, encoded, encodedBuffer, freqLBuffer); + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (DEBUG) { + System.out.println(" refillPositions"); + } + if (posIn.getFilePointer() == lastPosBlockFP) { + if (DEBUG) { + System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets); + } + final int count = posIn.readVInt(); + int payloadLength = 0; + for(int i=0;i>> 1; + if (payloadLength != 0) { + posIn.seek(posIn.getFilePointer() + payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + posIn.readVInt(); + posIn.readVInt(); + } + } + } else { + if (DEBUG) { + System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer()); + } + readBlock(posIn, encoded, encodedBuffer, 
posDeltaLBuffer); + } + } + + @Override + public int nextDoc() throws IOException { + + if (DEBUG) { + System.out.println(" FPR.nextDoc"); + } + + while (true) { + if (DEBUG) { + System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto); + } + + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + + if (docBufferUpto == blockSize) { + refillDocs(); + } + + if (DEBUG) { + System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]); + } + accum += (int)docDeltaBuffer[docBufferUpto]; + freq = (int)freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + if (DEBUG) { + System.out.println(" return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount); + } + position = 0; + return doc; + } + + if (DEBUG) { + System.out.println(" doc=" + accum + " is deleted; try next doc"); + } + } + } + + @Override + public int advance(int target) throws IOException { + // nocommit make frq block load lazy/skippable + if (DEBUG) { + System.out.println(" FPR.advance target=" + target); + } + + // nocommit 2 is heuristic guess!! + // nocommit put cheating back! does it help? + // nocommit use skipper!!! it has next last doc id!! + //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) { + if (docFreq > blockSize && target - accum > blockSize) { + + if (DEBUG) { + System.out.println(" try skipper"); + } + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + if (DEBUG) { + System.out.println(" create skipper"); + } + skipper = new BlockPackedSkipReader((IndexInput) docIn.clone(), + BlockPackedPostingsWriter.maxSkipLevels, + blockSize, + true, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + if (DEBUG) { + System.out.println(" init skipper"); + } + skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target); + + if (newDocUpto > docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto()); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + posPendingCount = skipper.getPosBufferUpto(); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + + // nocommit in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointe ... 
instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + if (DEBUG) { + System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + } + + final int leftInBlock = blockSize - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } else { + toSkip -= leftInBlock; + while(toSkip >= blockSize) { + if (DEBUG) { + System.out.println(" skip whole block @ fp=" + posIn.getFilePointer()); + } + assert posIn.getFilePointer() != lastPosBlockFP; + skipBlock(posIn); + toSkip -= blockSize; + } + refillPositions(); + posBufferUpto = toSkip; + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } + + position = 0; + } + + @Override + public int nextPosition() throws IOException { + if (DEBUG) { + System.out.println(" FPR.nextPosition posPendingCount=" + posPendingCount + " posBufferUpto=" + posBufferUpto); + } + if (posPendingFP != -1) { + if (DEBUG) { + System.out.println(" seek to pendingFP=" + posPendingFP); + } + posIn.seek(posPendingFP); + posPendingFP = -1; + + // Force buffer refill: + posBufferUpto = blockSize; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == blockSize) { + refillPositions(); + posBufferUpto = 0; + } + position += (int)posDeltaBuffer[posBufferUpto++]; + posPendingCount--; + if (DEBUG) { + System.out.println(" return pos=" + position); + } + return position; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public boolean hasPayload() { + return false; + } + + @Override + public BytesRef getPayload() { + return null; + } + } + + // Also handles payloads + offsets + final class EverythingEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + private final LongBuffer encodedBuffer; + + private final long[] docDeltaBuffer = new long[blockSize]; + private final long[] freqBuffer = new long[blockSize]; + private final long[] posDeltaBuffer = new long[blockSize]; + + private final long[] payloadLengthBuffer; + private final long[] offsetStartDeltaBuffer; + private final long[] offsetLengthBuffer; + + + private final LongBuffer docDeltaLBuffer = LongBuffer.wrap(docDeltaBuffer); + private final LongBuffer freqLBuffer = LongBuffer.wrap(freqBuffer); + private final LongBuffer posDeltaLBuffer = LongBuffer.wrap(posDeltaBuffer); + + private final LongBuffer payloadLengthLBuffer; + private final LongBuffer offsetStartDeltaLBuffer; + private final LongBuffer offsetLengthLBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastEndOffset; + private int startOffset; + private int endOffset; + + private int docBufferUpto; + private int posBufferUpto; + + private BlockPackedSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private 
int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public EverythingEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPackedPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + this.posIn = (IndexInput) BlockPackedPostingsReader.this.posIn.clone(); + this.payIn = (IndexInput) BlockPackedPostingsReader.this.payIn.clone(); + encoded = new byte[blockSize*4]; + encodedBuffer = ByteBuffer.wrap(encoded).asLongBuffer(); + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + if (indexHasOffsets) { + offsetStartDeltaBuffer = new long[blockSize]; + offsetLengthBuffer = new long[blockSize]; + offsetStartDeltaLBuffer = LongBuffer.wrap(offsetStartDeltaBuffer); + offsetLengthLBuffer = LongBuffer.wrap(offsetLengthBuffer); + } else { + offsetStartDeltaBuffer = null; + offsetStartDeltaLBuffer = null; + offsetLengthBuffer = null; + offsetLengthLBuffer = null; + startOffset = -1; + endOffset = -1; + } + + indexHasPayloads = fieldInfo.hasPayloads(); + if (indexHasPayloads) { + payloadLengthBuffer = new long[blockSize]; + payloadLengthLBuffer = LongBuffer.wrap(payloadLengthBuffer); + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadLengthLBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < blockSize) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == blockSize) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = blockSize; + skipped = false; + return this; + } + + @Override + public int freq() 
throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, encoded, encodedBuffer, docDeltaLBuffer); + + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, encoded, encodedBuffer, freqLBuffer); + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (DEBUG) { + System.out.println(" refillPositions"); + } + if (posIn.getFilePointer() == lastPosBlockFP) { + if (DEBUG) { + System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets); + } + final int count = posIn.readVInt(); + int payloadLength = 0; + payloadByteUpto = 0; + for(int i=0;i docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto() + " pay.fp=" + skipper.getPayPointer() + " lastEndOffset=" + lastEndOffset); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastEndOffset = skipper.getEndOffset(); + payloadByteUpto = skipper.getPayloadByteUpto(); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + + // nocommit in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointe ... 
instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + if (DEBUG) { + System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + } + + final int leftInBlock = blockSize - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while(posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + if (indexHasOffsets) { + lastEndOffset += offsetStartDeltaBuffer[posBufferUpto] + offsetLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } else { + toSkip -= leftInBlock; + while(toSkip >= blockSize) { + if (DEBUG) { + System.out.println(" skip whole block @ fp=" + posIn.getFilePointer()); + } + assert posIn.getFilePointer() != lastPosBlockFP; + skipBlock(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + skipBlock(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + // Must load offset blocks merely to sum + // up into lastEndOffset: + readBlock(payIn, encoded, encodedBuffer, offsetStartDeltaLBuffer); + readBlock(payIn, encoded, encodedBuffer, offsetLengthLBuffer); + for(int i=0;i= 0; + fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + fieldHasPayloads = fieldInfo.hasPayloads(); + skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); + } + + @Override + public void startTerm() { + docTermStartFP = docOut.getFilePointer(); + if (fieldHasPositions) { + posTermStartFP = posOut.getFilePointer(); + if (fieldHasPayloads || fieldHasOffsets) { + payTermStartFP = payOut.getFilePointer(); + } + } + lastBlockDocID = -1; + lastDocID = 0; + if (DEBUG) { + System.out.println("FPW.startTerm startFP=" + docTermStartFP); + } + skipWriter.resetSkip(); + } + + private void writeBlock(LongBuffer buffer, IndexOutput out) throws IOException { + final int header = ForUtil.compress(buffer, encodedBuffer); + out.writeVInt(header); + out.writeBytes(encoded, ForUtil.getEncodedSize(header)); + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + if (DEBUG) { + System.out.println("FPW.startDoc docID=" + docID); + } + + // nocommit do this in finishDoc... but does it fail...? + // is it not always called...? 
+ if (posOut != null && saveNextPosBlock) { + lastBlockPosFP = posOut.getFilePointer(); + if (payOut != null) { + lastBlockPayFP = payOut.getFilePointer(); + } + lastBlockPosBufferUpto = posBufferUpto; + lastBlockEndOffset = lastEndOffset; + lastBlockPayloadByteUpto = payloadByteUpto; + saveNextPosBlock = false; + if (DEBUG) { + System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto); + } + } + + final int docDelta = docID - lastDocID; + if (docID < 0 || (docCount > 0 && docDelta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")"); + } + lastDocID = docID; + + docDeltaBuffer[docBufferUpto] = docDelta; + if (DEBUG) { + System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta); + } + if (fieldHasFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + docBufferUpto++; + docCount++; + + if (docBufferUpto == blockSize) { + // nocommit maybe instead of buffering skip before + // writing a block based on last block's end data + // ... we could buffer after writing the block? only + // iffiness with that approach is it could be a + // pointlness skip? like we may stop adding docs + // right after that, then we have skip point AFTER + // last doc. the thing is, in finishTerm we are + // already sometimes adding a skip point AFTER the + // last doc? + if (lastBlockDocID != -1) { + if (DEBUG) { + System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize)); + } + skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto); + } + lastBlockDocID = docID; + saveNextPosBlock = true; + + if (DEBUG) { + System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer()); + } + writeBlock(docDeltaLBuffer, docOut); + if (fieldHasFreqs) { + if (DEBUG) { + System.out.println(" write freq block @ fp=" + docOut.getFilePointer()); + } + writeBlock(freqLBuffer, docOut); + } + docBufferUpto = 0; + } + + lastPosition = 0; + lastEndOffset = 0; + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + if (DEBUG) { + System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? 
" payloadByteUpto=" + payloadByteUpto: "")); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (fieldHasPayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (fieldHasOffsets) { + assert startOffset >= lastEndOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastEndOffset = endOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == blockSize) { + if (DEBUG) { + System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer()); + } + writeBlock(posDeltaLBuffer, posOut); + + if (fieldHasPayloads) { + writeBlock(payloadLengthLBuffer, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (fieldHasOffsets) { + writeBlock(offsetStartDeltaLBuffer, payOut); + writeBlock(offsetLengthLBuffer, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() { + } + + private static class PendingTerm { + public final long docStartFP; + public final long posStartFP; + public final long payStartFP; + public final int skipOffset; + public final int lastPosBlockOffset; + + public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) { + this.docStartFP = docStartFP; + this.posStartFP = posStartFP; + this.payStartFP = payStartFP; + this.skipOffset = skipOffset; + this.lastPosBlockOffset = lastPosBlockOffset; + } + } + + private final List pendingTerms = new ArrayList(); + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(TermStats stats) throws IOException { + + assert stats.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? 
+ assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount; + + if (DEBUG) { + System.out.println("FPW.finishTerm docFreq=" + stats.docFreq); + } + + // nocommit silly that skipper must write skip when we no + // postings come after it, but if we don't do this, skip + // reader incorrectly thinks it can read another level 0 + // skip entry here!: + //if (docCount > blockSize && docBufferUpto > 0) { + if (docCount > blockSize) { + final int lastDocCount = blockSize*(docCount/blockSize); + if (DEBUG) { + System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount); + } + skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto); + } + + if (DEBUG) { + if (docBufferUpto > 0) { + System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP); + } + } + + // vInt encode the remaining doc deltas and freqs: + for(int i=0;i 0) { + System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets); + } + } + + assert stats.totalTermFreq != -1; + if (stats.totalTermFreq > blockSize) { + lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP); + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + posOut.writeVInt(posBufferUpto); + + // nocommit should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< blockSize) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; + int payloadBytesReadUpto = 0; + for(int i=0;i 0? +// 128*128 is immense? may need to decouple +// baseSkipInterval & theRestSkipInterval? 
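The finishTerm loops that vInt-encode the leftover doc deltas and positions are likewise mangled above. The position side mirrors what refillPositions reads back: each position delta is shifted left one bit, the low bit flags that a new payload length follows, payload bytes are written inline, and offsets (when indexed) are written as a start delta plus a length. A hedged sketch follows; the helper name writeVIntPosTail and its signature are illustrative rather than the patch's code.

    // Hedged sketch: write the final partial position block (count < blockSize) as vInts,
    // in the shape that BlockDocsAndPositionsEnum/EverythingEnum.refillPositions decode.
    static void writeVIntPosTail(IndexOutput posOut, int count,
                                 long[] posDeltaBuffer, long[] payloadLengthBuffer,
                                 long[] offsetStartDeltaBuffer, long[] offsetLengthBuffer,
                                 byte[] payloadBytes, boolean fieldHasPayloads,
                                 boolean fieldHasOffsets) throws IOException {
      int lastPayloadLength = -1;        // the payload length is only re-written when it changes
      int payloadBytesReadUpto = 0;
      for (int i = 0; i < count; i++) {
        final int posDelta = (int) posDeltaBuffer[i];
        if (fieldHasPayloads) {
          final int payloadLength = (int) payloadLengthBuffer[i];
          if (payloadLength != lastPayloadLength) {
            lastPayloadLength = payloadLength;
            posOut.writeVInt((posDelta << 1) | 1); // low bit set: a new payload length follows
            posOut.writeVInt(payloadLength);
          } else {
            posOut.writeVInt(posDelta << 1);
          }
          if (payloadLength != 0) {
            posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
            payloadBytesReadUpto += payloadLength;
          }
        } else {
          posOut.writeVInt(posDelta);
        }
        if (fieldHasOffsets) {
          posOut.writeVInt((int) offsetStartDeltaBuffer[i]); // delta from the last end offset
          posOut.writeVInt((int) offsetLengthBuffer[i]);     // endOffset - startOffset
        }
      }
    }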
+ +final class BlockPackedSkipWriter extends MultiLevelSkipListWriter { + private boolean DEBUG = BlockPackedPostingsReader.DEBUG; + + private int[] lastSkipDoc; + private long[] lastSkipDocPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayPointer; + private int[] lastEndOffset; + private int[] lastPayloadByteUpto; + + private final IndexOutput docOut; + private final IndexOutput posOut; + private final IndexOutput payOut; + + private int curDoc; + private long curDocPointer; + private long curPosPointer; + private long curPayPointer; + private int curPosBufferUpto; + private int curEndOffset; + private int curPayloadByteUpto; + private boolean fieldHasPositions; + private boolean fieldHasOffsets; + private boolean fieldHasPayloads; + + public BlockPackedSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) { + super(skipInterval, maxSkipLevels, docCount); + this.docOut = docOut; + this.posOut = posOut; + this.payOut = payOut; + + lastSkipDoc = new int[maxSkipLevels]; + lastSkipDocPointer = new long[maxSkipLevels]; + if (posOut != null) { + lastSkipPosPointer = new long[maxSkipLevels]; + if (payOut != null) { + lastSkipPayPointer = new long[maxSkipLevels]; + } + lastEndOffset = new int[maxSkipLevels]; + lastPayloadByteUpto = new int[maxSkipLevels]; + } + } + + public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { + this.fieldHasPositions = fieldHasPositions; + this.fieldHasOffsets = fieldHasOffsets; + this.fieldHasPayloads = fieldHasPayloads; + } + + @Override + public void resetSkip() { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipDocPointer, docOut.getFilePointer()); + if (fieldHasPositions) { + Arrays.fill(lastSkipPosPointer, posOut.getFilePointer()); + if (fieldHasOffsets) { + Arrays.fill(lastEndOffset, 0); + } + if (fieldHasPayloads) { + Arrays.fill(lastPayloadByteUpto, 0); + } + if (fieldHasOffsets || fieldHasPayloads) { + Arrays.fill(lastSkipPayPointer, payOut.getFilePointer()); + } + } + } + + /** + * Sets the values for the current skip data. 
+ */ + public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException { + this.curDoc = doc; + this.curDocPointer = docOut.getFilePointer(); + this.curPosPointer = posFP; + this.curPayPointer = payFP; + this.curPosBufferUpto = posBufferUpto; + this.curPayloadByteUpto = payloadByteUpto; + this.curEndOffset = endOffset; + bufferSkip(numDocs); + } + + @Override + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + int delta = curDoc - lastSkipDoc[level]; + if (DEBUG) { + System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer); + } + skipBuffer.writeVInt(delta); + lastSkipDoc[level] = curDoc; + + skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level])); + lastSkipDocPointer[level] = curDocPointer; + + if (fieldHasPositions) { + if (DEBUG) { + System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto); + } + skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level])); + lastSkipPosPointer[level] = curPosPointer; + skipBuffer.writeVInt(curPosBufferUpto); + + if (fieldHasPayloads) { + skipBuffer.writeVInt(curPayloadByteUpto); + } + + if (fieldHasOffsets) { + skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]); + lastEndOffset[level] = curEndOffset; + } + + if (fieldHasOffsets || fieldHasPayloads) { + skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level])); + lastSkipPayPointer[level] = curPayPointer; + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/ForUtil.java new file mode 100644 index 00000000000..022e73ba79b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/ForUtil.java @@ -0,0 +1,130 @@ +package org.apache.lucene.codecs.blockpacked; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.IntBuffer; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedInts.Reader; +import org.apache.lucene.util.packed.PackedInts.Writer; +import org.apache.lucene.util.packed.PackedInts.Mutable; +import org.apache.lucene.util.packed.PackedInts.Encoder; +import org.apache.lucene.util.packed.PackedInts.Decoder; + +/** + * Encode all values in normal area with fixed bit width, + * which is determined by the max value in this block. 
+ */ +public class ForUtil { + protected static final int[] MASK = { 0x00000000, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, + 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff, + 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, + 0x7fffffff, 0xffffffff}; + + /** Compress given int[] into output stream, with For format + */ + public static int compress(final LongBuffer data, LongBuffer packed) throws IOException { + int numBits=getNumBits(data.array()); + + if (numBits == 0) { // when block is equal, save the value once + packed.put(0, data.get(0)<<32); // java uses big endian for LongBuffer impl + return (getHeader(1,numBits)); + } + + PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format; + PackedInts.Encoder encoder = PackedInts.getEncoder(format, PackedInts.VERSION_CURRENT, numBits); + int perIter = encoder.values(); + int iters = 128/perIter; + int nblocks = encoder.blocks()*iters; + assert 128 % perIter == 0; + + packed.rewind(); + data.rewind(); + + encoder.encode(data, packed, iters); + + int encodedSize = nblocks*2; + return getHeader(encodedSize,numBits); + } + + /** Decompress given ouput stream into int array. + */ + public static void decompress(LongBuffer data, LongBuffer packed, int header) throws IOException { + // nocommit assert header isn't "malformed", ie besides + // numBytes / bit-width there is nothing else! + + packed.rewind(); + data.rewind(); + int numBits = ((header >> 8) & MASK[6]); + + if (numBits == 0) { + Arrays.fill(data.array(), (int)(packed.get(0)>>>32)); + return; + } + + PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format; + PackedInts.Decoder decoder = PackedInts.getDecoder(format, PackedInts.VERSION_CURRENT, numBits); + int perIter = decoder.values(); + int iters = 128/perIter; + int nblocks = decoder.blocks()*iters; + assert 128 % perIter == 0; + + decoder.decode(packed, data, iters); + } + + static int getNumBits(final long[] data) { + if (isAllEqual(data)) { + return 0; + } + int size=data.length; + int optBits=1; + for (int i=0; i> 8) & MASK[6]); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForFactory.java b/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForFactory.java deleted file mode 100644 index 1c636b99fbb..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForFactory.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.apache.lucene.codecs.pfor; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.IntBuffer; - -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.codecs.sep.IntStreamFactory; -import org.apache.lucene.codecs.sep.IntIndexInput; -import org.apache.lucene.codecs.sep.IntIndexOutput; -import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput; -import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput; - -/** - * Used to plug to PostingsReader/WriterBase. - * Encoder and decoder in lower layers are called by - * flushBlock() and readBlock() - */ - -public final class ForFactory extends IntStreamFactory { - - public ForFactory() { - } - - @Override - public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException { - boolean success = false; - IndexOutput out = dir.createOutput(fileName, context); - try { - IntIndexOutput ret = new ForIndexOutput(out); - success = true; - return ret; - } finally { - if (!success) { - // For some cases (e.g. disk full), the IntIndexOutput may not be - // properly created. So we should close those opened files. - IOUtils.closeWhileHandlingException(out); - } - } - } - - @Override - public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException { - return new ForIndexInput(dir.openInput(fileName, context)); - } - - /** - * Here we'll hold both input buffer and output buffer for - * encoder/decoder. - */ - private class ForIndexInput extends FixedIntBlockIndexInput { - - ForIndexInput(final IndexInput in) throws IOException { - super(in); - } - - class ForBlockReader implements FixedIntBlockIndexInput.BlockReader { - private final byte[] encoded; - private final int[] buffer; - private final IndexInput in; - private final IntBuffer encodedBuffer; - - ForBlockReader(final IndexInput in, final int[] buffer) { - // upperbound for encoded value should include(here header is not buffered): - // blockSize of normal value when numFrameBits=32(4x bytes); - this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4]; - this.in = in; - this.buffer = buffer; - this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer(); - } - - // TODO: implement public void skipBlock() {} ? - @Override - public void readBlock() throws IOException { - final int header = in.readInt(); - final int numBytes = ForUtil.getEncodedSize(header); - assert numBytes <= ForPostingsFormat.DEFAULT_BLOCK_SIZE*4; - in.readBytes(encoded,0,numBytes); - ForUtil.decompress(encodedBuffer,buffer,header); - } - } - - @Override - protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException { - return new ForBlockReader(in,buffer); - } - } - - private class ForIndexOutput extends FixedIntBlockIndexOutput { - private final byte[] encoded; - private final IntBuffer encodedBuffer; - - ForIndexOutput(IndexOutput out) throws IOException { - super(out,ForPostingsFormat.DEFAULT_BLOCK_SIZE); - this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4]; - this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer(); - } - - @Override - protected void flushBlock() throws IOException { - final int header = ForUtil.compress(buffer,encodedBuffer); - final int numBytes = ForUtil.getEncodedSize(header); - // nocommit writeVInt instead? 
- out.writeInt(header); - out.writeBytes(encoded, numBytes); - } - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForFactory.java b/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForFactory.java deleted file mode 100644 index aa06f785e9a..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForFactory.java +++ /dev/null @@ -1,129 +0,0 @@ -package org.apache.lucene.codecs.pfor; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.IntBuffer; - -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.codecs.sep.IntStreamFactory; -import org.apache.lucene.codecs.sep.IntIndexInput; -import org.apache.lucene.codecs.sep.IntIndexOutput; -import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput; -import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput; - -/** - * Used to plug to PostingsReader/WriterBase. - * Encoder and decoder in lower layers are called by - * flushBlock() and readBlock() - */ - -public final class PForFactory extends IntStreamFactory { - - public PForFactory() { - } - - @Override - public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException { - boolean success = false; - IndexOutput out = dir.createOutput(fileName, context); - try { - IntIndexOutput ret = new PForIndexOutput(out); - success = true; - return ret; - } finally { - if (!success) { - // For some cases (e.g. disk full), the IntIndexOutput may not be - // properly created. So we should close those opened files. - IOUtils.closeWhileHandlingException(out); - } - } - } - - @Override - public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException { - return new PForIndexInput(dir.openInput(fileName, context)); - } - - /** - * Here we'll hold both input buffer and output buffer for - * encoder/decoder. - */ - private class PForIndexInput extends FixedIntBlockIndexInput { - - PForIndexInput(final IndexInput in) throws IOException { - super(in); - } - - class PForBlockReader implements FixedIntBlockIndexInput.BlockReader { - private final byte[] encoded; - private final int[] buffer; - private final IndexInput in; - private final IntBuffer encodedBuffer; - - PForBlockReader(final IndexInput in, final int[] buffer) { - // upperbound for encoded value should include(here header is not buffered): - // 1. blockSize of normal value (4x bytes); - // 2. 
blockSize of exception value (4x bytes); - this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8]; - this.in = in; - this.buffer = buffer; - this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer(); - } - - // TODO: implement public void skipBlock() {} ? - @Override - public void readBlock() throws IOException { - final int header = in.readInt(); - final int numBytes = PForUtil.getEncodedSize(header); - assert numBytes <= PForPostingsFormat.DEFAULT_BLOCK_SIZE*8; - in.readBytes(encoded,0,numBytes); - PForUtil.decompress(encodedBuffer,buffer,header); - } - } - - @Override - protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException { - return new PForBlockReader(in,buffer); - } - } - - private class PForIndexOutput extends FixedIntBlockIndexOutput { - private final byte[] encoded; - private final IntBuffer encodedBuffer; - - PForIndexOutput(IndexOutput out) throws IOException { - super(out, PForPostingsFormat.DEFAULT_BLOCK_SIZE); - this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8]; - this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer(); - } - - @Override - protected void flushBlock() throws IOException { - final int header = PForUtil.compress(buffer,encodedBuffer); - final int numBytes = PForUtil.getEncodedSize(header); - // nocommit writeVInt instead? - out.writeInt(header); - out.writeBytes(encoded, numBytes); - } - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForPostingsFormat.java deleted file mode 100644 index 8704dcdd4cf..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForPostingsFormat.java +++ /dev/null @@ -1,115 +0,0 @@ -package org.apache.lucene.codecs.pfor; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.Set; - -import org.apache.lucene.codecs.BlockTreeTermsReader; -import org.apache.lucene.codecs.BlockTreeTermsWriter; -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.FixedGapTermsIndexReader; -import org.apache.lucene.codecs.FixedGapTermsIndexWriter; -import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.TermsIndexReaderBase; -import org.apache.lucene.codecs.TermsIndexWriterBase; -import org.apache.lucene.codecs.sep.SepPostingsReader; -import org.apache.lucene.codecs.sep.SepPostingsWriter; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; - -/** - * Pass PForFactory to a PostingsWriter/ReaderBase, and get - * customized postings format plugged. - */ -public final class PForPostingsFormat extends PostingsFormat { - private final int minBlockSize; - private final int maxBlockSize; - public final static int DEFAULT_BLOCK_SIZE = 128; - - public PForPostingsFormat() { - super("PFor"); - this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE; - this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE; - } - public PForPostingsFormat(int minBlockSize, int maxBlockSize) { - super("PFor"); - this.minBlockSize = minBlockSize; - assert minBlockSize > 1; - this.maxBlockSize = maxBlockSize; - assert minBlockSize <= maxBlockSize; - } - - @Override - public String toString() { - return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE+ ")"; - } - - @Override - public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - // TODO: implement a new PostingsWriterBase to improve skip-settings - PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new PForFactory()); - boolean success = false; - try { - FieldsConsumer ret = new BlockTreeTermsWriter(state, - postingsWriter, - minBlockSize, - maxBlockSize); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } - } - - @Override - public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, - state.fieldInfos, - state.segmentInfo, - state.context, - new PForFactory(), - state.segmentSuffix); - - boolean success = false; - try { - FieldsProducer ret = new BlockTreeTermsReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.context, - state.segmentSuffix, - state.termsIndexDivisor); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsReader); - } - } - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForUtil.java deleted file mode 100644 index 92baa3e2317..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/PForUtil.java +++ /dev/null @@ -1,343 +0,0 @@ -package org.apache.lucene.codecs.pfor; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.nio.IntBuffer; -import java.nio.ByteBuffer; -import java.util.Arrays; - -/** - * Encode all small values and exception pointers in normal area; - * Encode large values in exception area; - * Size per exception is variable, possibly: 1byte, 2bytes, or 4bytes - */ -public final class PForUtil extends ForUtil { - - protected static final int[] PER_EXCEPTION_SIZE = {1,2,4}; - - /** Compress given int[] into Integer buffer, with PFor format - * - * @param data uncompressed data - * @param intBuffer integer buffer to hold compressed data - * @return block header - */ - public static int compress(final int[] data, IntBuffer intBuffer) { - /** estimate minimum compress size to determine numFrameBits */ - int numBits=getNumBits(data); - if (numBits == 0) { - return compressDuplicateBlock(data,intBuffer); - } - - int size = data.length; - int[] excValues = new int[size]; - int excNum = 0, excLastPos = -1, excFirstPos = -1, excLastNonForcePos = -1; - - // num of exception until the last non-forced exception - int excNumBase = 0; - - // bytes per exception - int excBytes = 1; - - // bytes before exception area, e.g. header and normal area - int excByteOffset = 0; - - // the max value possible for current exception pointer, - // value of the first pointer is limited by header as 254 - // (first exception ranges from -1 ~ 254) - long maxChainFirst = 254; - long maxChain = maxChainFirst + 1; - - boolean conValue, conForce, conEnd; - int i=0; - - /** estimate exceptions */ - for (i=0; i= maxChain + excLastPos); // force exception - if (conValue || conForce) { - excValues[excNum++] = data[i]; - if (excLastPos == -1) { - maxChain = 1L<= maxChain + excLastPos); // force exception - conEnd = (excNum == excNumBase); // following forced ignored - if ((!conValue && !conForce) || conEnd) { - encodeNormalValue(intBuffer,i,data[i], numBits); - } else { - encodeNormalValue(intBuffer, excLastPos, i-excLastPos-1, numBits); - excNum++; - excLastPos = i; - } - } - } - - /** encode exception area */ - for (i=0; i> 8) & MASK[8]) + 1; - int excFirstPos = ((header >> 16) & MASK[8]) - 1; - int excBytes = PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]]; - int numBits = ((header >> 24) & MASK[6]); - - decompressCore(intBuffer, data, numBits); - - patchException(intBuffer,data,excNum,excFirstPos,excBytes); - } - - /** - * Encode exception values into exception area. - * The width for each exception will be fixed as: - * 1, 2, or 4 byte(s). 
- */ - static void encodeExcValues(IntBuffer intBuffer, int[] values, int num, int perbytes, int byteOffset) { - if (num == 0) - return; - if (perbytes == 1) { - int curBytePos = byteOffset; - for (int i=0; i>> 8) & MASK[8]); - curPos = patch(data, curPos, (curInt >>> 16) & MASK[8]); - curPos = patch(data, curPos, (curInt >>> 24) & MASK[8]); - } - if (i>> j) & MASK[8]); - } - } - } else if (excBytes == 2) { // each exception consumes 2 bytes - for (i=0; i+1>> 16) & MASK[16]); - } - if (i pos; - return nextPos; - } - - /** - * Estimate best number of frame bits according to minimum compressed size. - * It will run 32 times. - */ - static int getNumBits(final int[] data) { - if (isAllEqual(data)) { - return 0; - } - int optBits=1; - int optSize=estimateCompressedSize(data,optBits); - for (int i=2; i<=32; ++i) { - int curSize=estimateCompressedSize(data,i); - if (curSize> 8) & MASK[8]) + 1; - } - public static int getFirstPos(int header) { - return ((header >> 16) & MASK[8]) - 1; - } - public static int getExcBytes(int header) { - return PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]]; - } - public static int getNumBits(int header) { - return ((header >> 24) & MASK[6]); - } -} diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index a89e8702acd..115cb3d5f1b 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -17,8 +17,6 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat -org.apache.lucene.codecs.pfor.ForPostingsFormat -org.apache.lucene.codecs.pfor.PForPostingsFormat org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat org.apache.lucene.codecs.block.BlockPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/codecs/pfor/TestPForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/pfor/TestPForUtil.java deleted file mode 100644 index 6394d92851d..00000000000 --- a/lucene/core/src/test/org/apache/lucene/codecs/pfor/TestPForUtil.java +++ /dev/null @@ -1,293 +0,0 @@ -package org.apache.lucene.codecs.pfor; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.nio.ByteBuffer; -import java.nio.IntBuffer; -import java.util.Arrays; -import java.util.Collections; -import java.util.Locale; -import java.util.Random; - -import org.apache.lucene.codecs.pfor.ForPostingsFormat; -import org.apache.lucene.codecs.pfor.PForUtil; -import org.apache.lucene.util.LuceneTestCase; - -/** - * Test the core utility for PFor compress and decompress - * We don't specially provide test case for For encoder/decoder, since - * PFor is a extended version of For, and most methods will be reused - * here. - */ -public class TestPForUtil extends LuceneTestCase { - static final int[] MASK={ 0x00000000, - 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, - 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, - 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff, - 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, - 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, - 0x7fffffff, 0xffffffff}; - Random gen; - public void initRandom() { - this.gen = random(); - } - - /** - * Should not encode extra information other than single int - */ - public void testAllEqual() throws Exception { - initRandom(); - int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE; - int[] data=new int[sz]; - byte[] res = new byte[sz*8]; - int[] copy = new int[sz]; - IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer(); - int ensz; - int header; - - Arrays.fill(data,gen.nextInt()); - header = ForUtil.compress(data,resBuffer); // test For - ensz = ForUtil.getEncodedSize(header); - assert ensz == 4; - - ForUtil.decompress(resBuffer,copy,header); - assert cmp(data,sz,copy,sz)==true; - - Arrays.fill(data,gen.nextInt()); - header = PForUtil.compress(data,resBuffer); // test PFor - ensz = PForUtil.getEncodedSize(header); - assert ensz == 4; - - PForUtil.decompress(resBuffer,copy,header); - assert cmp(data,sz,copy,sz)==true; - } - - /** - * Test correctness of forced exception. - * the forced ones should exactly fit max chain - */ - public void testForcedExceptionDistance() throws Exception { - initRandom(); - int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE; - int[] data=new int[sz]; - byte[] res = new byte[sz*8]; - int[] copy = new int[sz]; - IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer(); - int numBits = gen.nextInt(5)+1; - - int i,j; - int pace, ensz, header; - int expect, got; - - // fill exception value with same pace, there should - // be no forced exceptions. - createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]); - pace = 1<0 - createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]); - pace = (1< "+sz*8; // must not exceed the loose upperbound - assert (ensz >= 4); // at least we have an exception, right? - - PForUtil.decompress(resBuffer,copy,header); - -// println(getHex(data,sz)+"\n"); -// println(getHex(res,ensz)+"\n"); -// println(getHex(copy,sz)+"\n"); - - // fetch the last int, i.e. last exception. - int lastExc = (res[ensz-4] << 24) | - ((0xff & res[ensz-3]) << 16) | - ((0xff & res[ensz-2]) << 8 ) | - (0xff & res[ensz-1]); - - // trailing forced exceptions are suppressed, - // so the last exception should be what we assigned. - assert lastExc==excValue; - assert cmp(data,sz,copy,sz)==true; - } - - /** - * Test correctness of compressing and decompressing. - * Here we randomly assign a rate of exception (i.e. 1-alpha), - * and test different scale of normal/exception values. 
- */ - public void testAllDistribution() throws Exception { - initRandom(); - int sz = ForPostingsFormat.DEFAULT_BLOCK_SIZE; - int[] data = new int[sz]; - for (int i=0; i<=32; ++i) { // try to test every kinds of distribution - double alpha=gen.nextDouble(); // rate of normal value - for (int j=i; j<=32; ++j) { - createDistribution(data,sz,alpha,MASK[i],MASK[j]); - tryCompressAndDecompress(data, sz); - } - } - } - public void createDistribution(int[] data, int sz, double alpha, int masknorm, int maskexc) { - Integer[] buff= new Integer[sz]; - int i=0; - for (; i0 && (i)%16 == 0) - hex.append("\n"); - byte b=raw[i]; - hex.append(HEXES.charAt((b & 0xF0) >> 4)) - .append(HEXES.charAt((b & 0x0F))) - .append(" "); - } - return hex.toString(); - } - public static String getHex( int [] raw, int sz ) { - if ( raw == null ) { - return null; - } - final StringBuilder hex = new StringBuilder( 4 * raw.length ); - for ( int i=0; i0 && i%8 == 0) - hex.append("\n"); - hex.append(String.format(Locale.ENGLISH, "%08x ",raw[i])); - } - return hex.toString(); - } - static void eprintln(String format, Object... args) { - System.err.println(String.format(Locale.ENGLISH, format,args)); - } - static void println(String format, Object... args) { - System.out.println(String.format(Locale.ENGLISH, format,args)); - } - static void print(String format, Object... args) { - System.out.print(String.format(Locale.ENGLISH, format,args)); - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java index a8bbb94b001..97fef583a5f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java @@ -61,7 +61,6 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; -import org.apache.lucene.codecs.pfor.*; /** * Randomly combines terms index impl w/ postings impls. @@ -103,8 +102,6 @@ public class MockRandomPostingsFormat extends PostingsFormat { final int baseBlockSize = _TestUtil.nextInt(random, 1, 127); delegates.add(new MockVariableIntBlockPostingsFormat.MockIntFactory(baseBlockSize)); // TODO: others - delegates.add(new ForFactory()); - delegates.add(new PForFactory()); } private static String getExtension(String fileName) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 538559a761c..f9527a86ce8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -282,9 +282,7 @@ public abstract class LuceneTestCase extends Assert { "MockFixedIntBlock", "MockVariableIntBlock", "MockSep", - "MockRandom", - "For", - "PFor" + "MockRandom" )); // -----------------------------------------------------------------
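With the META-INF/services entries removed above, the "For" and "PFor" names can no longer be resolved through the postings-format SPI; only the formats that remain registered (such as the block format) are available by name. A minimal sketch of that lookup, under the assumption that BlockPostingsFormat registers itself under the name "Block" (the class below is illustrative only and not part of the patch):

    import org.apache.lucene.codecs.PostingsFormat;

    public class ResolveFormatExample {
      public static void main(String[] args) {
        // Look up a postings format by its SPI name.  "For" and "PFor" would no
        // longer resolve once their META-INF/services entries are removed.
        PostingsFormat block = PostingsFormat.forName("Block");
        System.out.println(block);
      }
    }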