From 1e49670a55937e81dd4ed18b68654d50e826e25c Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 19 Jul 2012 16:53:58 +0000 Subject: [PATCH] LUCENE-4225: BlockPostingsFormat git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363421 13f79535-47bb-0310-9956-ffa450edef68 --- build.xml | 3 +- .../lucene/codecs/BlockTreeTermsWriter.java | 2 +- .../codecs/block/BlockPostingsFormat.java | 108 ++ .../codecs/block/BlockPostingsReader.java | 1524 +++++++++++++++++ .../codecs/block/BlockPostingsWriter.java | 574 +++++++ .../lucene/codecs/block/BlockSkipReader.java | 205 +++ .../lucene/codecs/block/BlockSkipWriter.java | 147 ++ .../lucene40/Lucene40PostingsWriter.java | 2 - .../apache/lucene/codecs/pfor/ForUtil.java | 5 +- .../org/apache/lucene/index/FieldInfos.java | 18 + .../org.apache.lucene.codecs.PostingsFormat | 1 + .../lucene/index/TestPostingsFormat.java | 866 ++++++++++ 12 files changed, 3449 insertions(+), 6 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/block/BlockSkipReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/block/BlockSkipWriter.java create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java diff --git a/build.xml b/build.xml index d16c3ff1dc7..c37f91e73b9 100644 --- a/build.xml +++ b/build.xml @@ -24,7 +24,8 @@ - + + diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index 80156f869a7..0c070abb1c6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // Write term stats, to separate byte[] blob: bytesWriter2.writeVInt(term.stats.docFreq); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { - assert term.stats.totalTermFreq >= term.stats.docFreq; + assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq; bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java new file mode 100644 index 00000000000..5d73d1704a9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java @@ -0,0 +1,108 @@ +package org.apache.lucene.codecs.block; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.BlockTreeTermsReader; +import org.apache.lucene.codecs.BlockTreeTermsWriter; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Pass ForFactory to a PostingsWriter/ReaderBase, and get + * customized postings format plugged. + */ +public final class BlockPostingsFormat extends PostingsFormat { + public static final String DOC_EXTENSION = "doc"; + public static final String POS_EXTENSION = "pos"; + public static final String PAY_EXTENSION = "pay"; + + private final int minTermBlockSize; + private final int maxTermBlockSize; + public final static int DEFAULT_BLOCK_SIZE = 128; + + public BlockPostingsFormat() { + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("Block"); + this.minTermBlockSize = minTermBlockSize; + assert minTermBlockSize > 1; + this.maxTermBlockSize = maxTermBlockSize; + assert minTermBlockSize <= maxTermBlockSize; + } + + @Override + public String toString() { + return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // TODO: implement a new PostingsWriterBase to improve skip-settings + PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128); + + boolean success = false; + try { + FieldsConsumer ret = new BlockTreeTermsWriter(state, + postingsWriter, + minTermBlockSize, + maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir, + state.fieldInfos, + state.segmentInfo, + state.context, + state.segmentSuffix, + 128); + boolean success = false; + try { + FieldsProducer ret = new BlockTreeTermsReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.context, + state.segmentSuffix, + state.termsIndexDivisor); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java new file mode 100644 index 00000000000..efec0e12282 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java @@ -0,0 +1,1524 @@ +package org.apache.lucene.codecs.block; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.pfor.ForUtil;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+// nocommit move ForUtil here?
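+// Reader for the Block postings format: doc deltas, freqs, position deltas
+// and payload/offset data are stored in fixed-size blocks of blockSize
+// (128 by default) ints, packed with ForUtil; each term's final partial
+// block (fewer than blockSize entries) is encoded as plain vInts.  In that
+// vInt tail the doc delta and freq are folded together (when freqs are
+// indexed), as the decode loops below show: a delta of 5 with freq 1
+// arrives as the single vInt (5<<1)|1 = 11, while a delta of 5 with freq 3
+// arrives as 5<<1 = 10 followed by the vInt 3.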
+ +// nocommit javadocs +public final class BlockPostingsReader extends PostingsReaderBase { + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + public static boolean DEBUG = false; + + // nocommit + final String segment; + + // NOTE: not private to avoid access$NNN methods: + final int blockSize; + + public BlockPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix, int blockSize) throws IOException { + boolean success = false; + segment = segmentInfo.name; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + try { + docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPostingsFormat.DOC_EXTENSION), + ioContext); + CodecUtil.checkHeader(docIn, + BlockPostingsWriter.DOC_CODEC, + BlockPostingsWriter.VERSION_START, + BlockPostingsWriter.VERSION_START); + + if (fieldInfos.hasProx()) { + posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPostingsFormat.POS_EXTENSION), + ioContext); + CodecUtil.checkHeader(posIn, + BlockPostingsWriter.POS_CODEC, + BlockPostingsWriter.VERSION_START, + BlockPostingsWriter.VERSION_START); + + if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) { + payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPostingsFormat.PAY_EXTENSION), + ioContext); + CodecUtil.checkHeader(payIn, + BlockPostingsWriter.PAY_CODEC, + BlockPostingsWriter.VERSION_START, + BlockPostingsWriter.VERSION_START); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + + this.blockSize = blockSize; + } + + @Override + public void init(IndexInput termsIn) throws IOException { + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, + BlockPostingsWriter.TERMS_CODEC, + BlockPostingsWriter.VERSION_START, + BlockPostingsWriter.VERSION_START); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != blockSize) { + throw new IllegalStateException("index-time blockSize (" + indexBlockSize + ") != read-time blockSize (" + blockSize + ")"); + } + } + + // Must keep final because we do non-standard clone + private final static class IntBlockTermState extends BlockTermState { + long docStartFP; + long posStartFP; + long payStartFP; + int skipOffset; + int lastPosBlockOffset; + + // Only used by the "primary" TermState -- clones don't + // copy this (basically they are "transient"): + ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...? + byte[] bytes; + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + + // Do not copy bytes, bytesReader (else TermState is + // very heavy, ie drags around the entire block's + // byte[]). On seek back, if next() is in fact used + // (rare!), they will be re-read from disk. 
+ } + + @Override + public String toString() { + return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset; + } + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + /* Reads but does not decode the byte[] blob holding + metadata for the current terms block */ + @Override + public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + + final int numBytes = termsIn.readVInt(); + + if (termState.bytes == null) { + termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + termState.bytesReader = new ByteArrayDataInput(); + } else if (termState.bytes.length < numBytes) { + termState.bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + + termsIn.readBytes(termState.bytes, 0, numBytes); + termState.bytesReader.reset(termState.bytes, 0, numBytes); + } + + @Override + public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) + throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + final boolean isFirstTerm = termState.termBlockOrd == 0; + final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + final boolean fieldHasPayloads = fieldInfo.hasPayloads(); + + final DataInput in = termState.bytesReader; + if (isFirstTerm) { + termState.docStartFP = in.readVLong(); + if (fieldHasPositions) { + termState.posStartFP = in.readVLong(); + if (termState.totalTermFreq > blockSize) { + termState.lastPosBlockOffset = in.readVInt(); + } else { + termState.lastPosBlockOffset = -1; + } + if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= blockSize) { + termState.payStartFP = in.readVLong(); + } else { + termState.payStartFP = -1; + } + } + } else { + termState.docStartFP += in.readVLong(); + if (fieldHasPositions) { + termState.posStartFP += in.readVLong(); + if (termState.totalTermFreq > blockSize) { + termState.lastPosBlockOffset = in.readVInt(); + } else { + termState.lastPosBlockOffset = -1; + } + if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= blockSize) { + long delta = in.readVLong(); + if (termState.payStartFP == -1) { + termState.payStartFP = delta; + } else { + termState.payStartFP += delta; + } + } + } + } + + if (termState.docFreq > blockSize) { + termState.skipOffset = in.readVInt(); + } else { + termState.skipOffset = -1; + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsEnum reuse, boolean needsFreqs) throws IOException { + BlockDocsEnum docsEnum; + if (reuse instanceof BlockDocsEnum) { + docsEnum = (BlockDocsEnum) reuse; + if (!docsEnum.canReuse(docIn, fieldInfo)) { + docsEnum = new BlockDocsEnum(fieldInfo); + } + } else { + docsEnum = new BlockDocsEnum(fieldInfo); + } + return docsEnum.reset(liveDocs, (IntBlockTermState) termState); + } + + // TODO: specialize to liveDocs vs not, and freqs vs not + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, + DocsAndPositionsEnum reuse, boolean needsOffsets) + 
throws IOException { + + // nocommit use needsPayloads here: + if (!needsOffsets && !fieldInfo.hasPayloads()) { + BlockDocsAndPositionsEnum docsAndPositionsEnum; + if (reuse instanceof BlockDocsAndPositionsEnum) { + docsAndPositionsEnum = (BlockDocsAndPositionsEnum) reuse; + if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + } else { + docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo); + } + return docsAndPositionsEnum.reset(liveDocs, (IntBlockTermState) termState); + } else { + EverythingEnum everythingEnum; + if (reuse instanceof EverythingEnum) { + everythingEnum = (EverythingEnum) reuse; + if (!everythingEnum.canReuse(docIn, fieldInfo)) { + everythingEnum = new EverythingEnum(fieldInfo); + } + } else { + everythingEnum = new EverythingEnum(fieldInfo); + } + return everythingEnum.reset(liveDocs, (IntBlockTermState) termState); + } + } + + final class BlockDocsEnum extends DocsEnum { + private final byte[] encoded; + private final IntBuffer encodedBuffer; + + private final int[] docDeltaBuffer = new int[blockSize]; + private final int[] freqBuffer = new int[blockSize]; + + private int docBufferUpto; + + private BlockSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + encoded = new byte[blockSize*4 + 4]; + encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer(); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasFreq == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) && + indexHasPos == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: seg=" + segment + " termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = blockSize; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + 
return doc; + } + + private void readBlock(IndexInput in, int[] buffer) throws IOException { + int header = in.readVInt(); + in.readBytes(encoded, 0, ForUtil.getEncodedSize(header)); + ForUtil.decompress(encodedBuffer, buffer, header); + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); + } + readBlock(docIn, docDeltaBuffer); + + if (indexHasFreq) { + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + readBlock(docIn, freqBuffer); + } + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } else { + docDeltaBuffer[i] = code; + } + } + } + docBufferUpto = 0; + } + + @Override + public int nextDoc() throws IOException { + + if (DEBUG) { + System.out.println("\nFPR.nextDoc"); + } + + while (true) { + if (DEBUG) { + System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto); + } + + if (docUpto == docFreq) { + if (DEBUG) { + System.out.println(" return doc=END"); + } + return doc = NO_MORE_DOCS; + } + + if (docBufferUpto == blockSize) { + refillDocs(); + } + + if (DEBUG) { + System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]); + } + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + freq = freqBuffer[docBufferUpto]; + docBufferUpto++; + if (DEBUG) { + System.out.println(" return doc=" + doc + " freq=" + freq); + } + return doc; + } + + if (DEBUG) { + System.out.println(" doc=" + accum + " is deleted; try next doc"); + } + + docBufferUpto++; + } + } + + @Override + public int advance(int target) throws IOException { + // nocommit make frq block load lazy/skippable + + // nocommit 2 is heuristic guess!! + // nocommit put cheating back! does it help? + // nocommit use skipper!!! it has next last doc id!! 
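+      // The skip list is only consulted when this term has more than one
+      // block of postings (docFreq > blockSize) and the target is more
+      // than a block ahead of the current doc; otherwise the enum simply
+      // scans forward with nextDoc() below.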
+ //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) { + if (docFreq > blockSize && target - accum > blockSize) { + + if (DEBUG) { + System.out.println("load skipper"); + } + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new BlockSkipReader((IndexInput) docIn.clone(), + BlockPostingsWriter.maxSkipLevels, + blockSize, + indexHasPos, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP+skipOffset, docTermStartFP, 0, 0, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target); + + if (newDocUpto > docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println("skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer()); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + } + + + final class BlockDocsAndPositionsEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + private final IntBuffer encodedBuffer; + + private final int[] docDeltaBuffer = new int[blockSize]; + private final int[] freqBuffer = new int[blockSize]; + private final int[] posDeltaBuffer = new int[blockSize]; + + private int docBufferUpto; + private int posBufferUpto; + + private BlockSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final IndexInput posIn; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone(); + encoded = new byte[blockSize*4 + 4]; + encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer(); + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public DocsAndPositionsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + posPendingFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < blockSize) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == blockSize) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = blockSize; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void readBlock(IndexInput in, int[] buffer) throws IOException { + int header = in.readVInt(); + in.readBytes(encoded, 0, ForUtil.getEncodedSize(header)); + ForUtil.decompress(encodedBuffer, buffer, header); + } + + private void skipBlock(IndexInput in) throws IOException { + int header = in.readVInt(); + in.seek(in.getFilePointer() + ForUtil.getEncodedSize(header)); + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, docDeltaBuffer); + + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, freqBuffer); + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (DEBUG) { + System.out.println(" refillPositions"); + } + if (posIn.getFilePointer() == lastPosBlockFP) { + if (DEBUG) { + System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets); + } + final int count = posIn.readVInt(); + int payloadLength = 0; + for(int i=0;i>> 1; + if (payloadLength 
!= 0) { + posIn.seek(posIn.getFilePointer() + payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + posIn.readVInt(); + posIn.readVInt(); + } + } + } else { + if (DEBUG) { + System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer()); + } + readBlock(posIn, posDeltaBuffer); + } + } + + @Override + public int nextDoc() throws IOException { + + if (DEBUG) { + System.out.println(" FPR.nextDoc"); + } + + while (true) { + if (DEBUG) { + System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto); + } + + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + + if (docBufferUpto == blockSize) { + refillDocs(); + } + + if (DEBUG) { + System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]); + } + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (liveDocs == null || liveDocs.get(accum)) { + doc = accum; + if (DEBUG) { + System.out.println(" return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount); + } + position = 0; + return doc; + } + + if (DEBUG) { + System.out.println(" doc=" + accum + " is deleted; try next doc"); + } + } + } + + @Override + public int advance(int target) throws IOException { + // nocommit make frq block load lazy/skippable + if (DEBUG) { + System.out.println(" FPR.advance target=" + target); + } + + // nocommit 2 is heuristic guess!! + // nocommit put cheating back! does it help? + // nocommit use skipper!!! it has next last doc id!! + //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) { + if (docFreq > blockSize && target - accum > blockSize) { + + if (DEBUG) { + System.out.println(" try skipper"); + } + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + if (DEBUG) { + System.out.println(" create skipper"); + } + skipper = new BlockSkipReader((IndexInput) docIn.clone(), + BlockPostingsWriter.maxSkipLevels, + blockSize, + true, + indexHasOffsets, + indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + if (DEBUG) { + System.out.println(" init skipper"); + } + skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target); + + if (newDocUpto > docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto()); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + posPendingCount = skipper.getPosBufferUpto(); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + + // nocommit in theory we could avoid loading frq block + // when not needed, ie, use skip 
data to load how far to + // seek the pos pointe ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + if (DEBUG) { + System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + } + + final int leftInBlock = blockSize - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } else { + toSkip -= leftInBlock; + while(toSkip >= blockSize) { + if (DEBUG) { + System.out.println(" skip whole block @ fp=" + posIn.getFilePointer()); + } + assert posIn.getFilePointer() != lastPosBlockFP; + skipBlock(posIn); + toSkip -= blockSize; + } + refillPositions(); + posBufferUpto = toSkip; + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } + + position = 0; + } + + @Override + public int nextPosition() throws IOException { + if (DEBUG) { + System.out.println(" FPR.nextPosition posPendingCount=" + posPendingCount + " posBufferUpto=" + posBufferUpto); + } + if (posPendingFP != -1) { + if (DEBUG) { + System.out.println(" seek to pendingFP=" + posPendingFP); + } + posIn.seek(posPendingFP); + posPendingFP = -1; + + // Force buffer refill: + posBufferUpto = blockSize; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == blockSize) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto++]; + posPendingCount--; + if (DEBUG) { + System.out.println(" return pos=" + position); + } + return position; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public boolean hasPayload() { + return false; + } + + @Override + public BytesRef getPayload() { + return null; + } + } + + // Also handles payloads + offsets + final class EverythingEnum extends DocsAndPositionsEnum { + + private final byte[] encoded; + private final IntBuffer encodedBuffer; + + private final int[] docDeltaBuffer = new int[blockSize]; + private final int[] freqBuffer = new int[blockSize]; + private final int[] posDeltaBuffer = new int[blockSize]; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastEndOffset; + private int startOffset; + private int endOffset; + + private int docBufferUpto; + private int posBufferUpto; + + private BlockSkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + final IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before 
reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private int skipOffset; + + private Bits liveDocs; + + public EverythingEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = BlockPostingsReader.this.docIn; + this.docIn = (IndexInput) startDocIn.clone(); + this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone(); + this.payIn = (IndexInput) BlockPostingsReader.this.payIn.clone(); + encoded = new byte[blockSize*4 + 4]; + encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer(); + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + if (indexHasOffsets) { + offsetStartDeltaBuffer = new int[blockSize]; + offsetLengthBuffer = new int[blockSize]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + indexHasPayloads = fieldInfo.hasPayloads(); + if (indexHasPayloads) { + payloadLengthBuffer = new int[blockSize]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn && + indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) && + indexHasPayloads == fieldInfo.hasPayloads(); + } + + public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException { + this.liveDocs = liveDocs; + if (DEBUG) { + System.out.println(" FPR.reset: termState=" + termState); + } + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + docIn.seek(docTermStartFP); + skipOffset = termState.skipOffset; + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < blockSize) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == blockSize) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = blockSize; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void readBlock(IndexInput in, int[] buffer) throws IOException { + int header = in.readVInt(); + in.readBytes(encoded, 0, ForUtil.getEncodedSize(header)); + ForUtil.decompress(encodedBuffer, buffer, header); + } + + private void skipBlock(IndexInput in) throws IOException { + int header = in.readVInt(); + in.seek(in.getFilePointer() + ForUtil.getEncodedSize(header)); + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= blockSize) { + if (DEBUG) { + System.out.println(" 
fill doc block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, docDeltaBuffer); + + if (DEBUG) { + System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); + } + + readBlock(docIn, freqBuffer); + } else { + // Read vInts: + if (DEBUG) { + System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); + } + for(int i=0;i>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (DEBUG) { + System.out.println(" refillPositions"); + } + if (posIn.getFilePointer() == lastPosBlockFP) { + if (DEBUG) { + System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets); + } + final int count = posIn.readVInt(); + int payloadLength = 0; + payloadByteUpto = 0; + for(int i=0;i docUpto) { + // Skipper moved + + if (DEBUG) { + System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto() + " pay.fp=" + skipper.getPayPointer() + " lastEndOffset=" + lastEndOffset); + } + + assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto; + docUpto = newDocUpto+1; + + // Force block read next: + docBufferUpto = blockSize; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastEndOffset = skipper.getEndOffset(); + payloadByteUpto = skipper.getPayloadByteUpto(); + } + } + + // Now scan: + while (nextDoc() != NO_MORE_DOCS) { + if (doc >= target) { + if (DEBUG) { + System.out.println(" advance return doc=" + doc); + } + return doc; + } + } + + if (DEBUG) { + System.out.println(" advance return doc=END"); + } + + return NO_MORE_DOCS; + } + + // nocommit in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointe ... 
instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + if (DEBUG) { + System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + } + + final int leftInBlock = blockSize - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while(posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + if (indexHasOffsets) { + lastEndOffset += offsetStartDeltaBuffer[posBufferUpto] + offsetLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + if (DEBUG) { + System.out.println(" skip w/in block to posBufferUpto=" + posBufferUpto); + } + } else { + toSkip -= leftInBlock; + while(toSkip >= blockSize) { + if (DEBUG) { + System.out.println(" skip whole block @ fp=" + posIn.getFilePointer()); + } + assert posIn.getFilePointer() != lastPosBlockFP; + skipBlock(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + skipBlock(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + // Must load offset blocks merely to sum + // up into lastEndOffset: + readBlock(payIn, offsetStartDeltaBuffer); + readBlock(payIn, offsetLengthBuffer); + for(int i=0;i= 0; + fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + fieldHasPayloads = fieldInfo.hasPayloads(); + skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); + } + + @Override + public void startTerm() { + docTermStartFP = docOut.getFilePointer(); + if (fieldHasPositions) { + posTermStartFP = posOut.getFilePointer(); + if (fieldHasPayloads || fieldHasOffsets) { + payTermStartFP = payOut.getFilePointer(); + } + } + lastBlockDocID = -1; + lastDocID = 0; + if (DEBUG) { + System.out.println("FPW.startTerm startFP=" + docTermStartFP); + } + skipWriter.resetSkip(); + } + + private void writeBlock(int[] buffer, IndexOutput out) throws IOException { + final int header = ForUtil.compress(buffer, encodedBuffer); + //System.out.println(" block has " + numBytes + " bytes"); + out.writeVInt(header); + out.writeBytes(encoded, ForUtil.getEncodedSize(header)); + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + if (DEBUG) { + System.out.println("FPW.startDoc docID=" + docID); + } + + // nocommit do this in finishDoc... but does it fail...? + // is it not always called...? 
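+      // saveNextPosBlock is set when a full doc block is flushed below; on
+      // the next startDoc the current pos/pay file pointers, position
+      // buffer offset, end offset and payload byte offset are captured so
+      // that the skip entry for that block can point at the position data
+      // of the first doc following the block.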
+ if (posOut != null && saveNextPosBlock) { + lastBlockPosFP = posOut.getFilePointer(); + if (payOut != null) { + lastBlockPayFP = payOut.getFilePointer(); + } + lastBlockPosBufferUpto = posBufferUpto; + lastBlockEndOffset = lastEndOffset; + lastBlockPayloadByteUpto = payloadByteUpto; + saveNextPosBlock = false; + if (DEBUG) { + System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto); + } + } + + final int docDelta = docID - lastDocID; + if (docID < 0 || (docCount > 0 && docDelta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")"); + } + lastDocID = docID; + + docDeltaBuffer[docBufferUpto] = docDelta; + if (DEBUG) { + System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta); + } + if (fieldHasFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + docBufferUpto++; + docCount++; + + if (docBufferUpto == blockSize) { + // nocommit maybe instead of buffering skip before + // writing a block based on last block's end data + // ... we could buffer after writing the block? only + // iffiness with that approach is it could be a + // pointlness skip? like we may stop adding docs + // right after that, then we have skip point AFTER + // last doc. the thing is, in finishTerm we are + // already sometimes adding a skip point AFTER the + // last doc? + if (lastBlockDocID != -1) { + if (DEBUG) { + System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize)); + } + skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto); + } + lastBlockDocID = docID; + saveNextPosBlock = true; + + if (DEBUG) { + System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer()); + } + writeBlock(docDeltaBuffer, docOut); + if (fieldHasFreqs) { + if (DEBUG) { + System.out.println(" write freq block @ fp=" + docOut.getFilePointer()); + } + writeBlock(freqBuffer, docOut); + } + docBufferUpto = 0; + } + + lastPosition = 0; + lastEndOffset = 0; + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + if (DEBUG) { + System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? 
" payloadByteUpto=" + payloadByteUpto: "")); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (fieldHasPayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (fieldHasOffsets) { + assert startOffset >= lastEndOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastEndOffset = endOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == blockSize) { + if (DEBUG) { + System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer()); + } + writeBlock(posDeltaBuffer, posOut); + + if (fieldHasPayloads) { + writeBlock(payloadLengthBuffer, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (fieldHasOffsets) { + writeBlock(offsetStartDeltaBuffer, payOut); + writeBlock(offsetLengthBuffer, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() { + } + + private static class PendingTerm { + public final long docStartFP; + public final long posStartFP; + public final long payStartFP; + public final int skipOffset; + public final int lastPosBlockOffset; + + public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) { + this.docStartFP = docStartFP; + this.posStartFP = posStartFP; + this.payStartFP = payStartFP; + this.skipOffset = skipOffset; + this.lastPosBlockOffset = lastPosBlockOffset; + } + } + + private final List pendingTerms = new ArrayList(); + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(TermStats stats) throws IOException { + + assert stats.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? 
+ assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount; + + if (DEBUG) { + System.out.println("FPW.finishTerm docFreq=" + stats.docFreq); + } + + // nocommit silly that skipper must write skip when we no + // postings come after it, but if we don't do this, skip + // reader incorrectly thinks it can read another level 0 + // skip entry here!: + //if (docCount > blockSize && docBufferUpto > 0) { + if (docCount > blockSize) { + final int lastDocCount = blockSize*(docCount/blockSize); + if (DEBUG) { + System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount); + } + skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto); + } + + if (DEBUG) { + if (docBufferUpto > 0) { + System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP); + } + } + + // vInt encode the remaining doc deltas and freqs: + for(int i=0;i 0) { + System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets); + } + } + + assert stats.totalTermFreq != -1; + if (stats.totalTermFreq > blockSize) { + lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP); + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + posOut.writeVInt(posBufferUpto); + + // nocommit should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< blockSize) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; + int payloadBytesReadUpto = 0; + for(int i=0;i 0? +// 128*128 is immense? may need to decouple +// baseSkipInterval & theRestSkipInterval? 
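+// One skip entry is buffered per full block of doc deltas; besides the doc
+// and doc-file-pointer deltas, each entry records the position/payload
+// file pointers, the offset into the current position block
+// (posBufferUpto), the payload byte offset and the end offset, so a reader
+// that skips can resume in the middle of a position block.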
+
+final class BlockSkipWriter extends MultiLevelSkipListWriter {
+  private boolean DEBUG = BlockPostingsReader.DEBUG;
+
+  private int[] lastSkipDoc;
+  private long[] lastSkipDocPointer;
+  private long[] lastSkipPosPointer;
+  private long[] lastSkipPayPointer;
+  private int[] lastEndOffset;
+  private int[] lastPayloadByteUpto;
+
+  private final IndexOutput docOut;
+  private final IndexOutput posOut;
+  private final IndexOutput payOut;
+
+  private int curDoc;
+  private long curDocPointer;
+  private long curPosPointer;
+  private long curPayPointer;
+  private int curPosBufferUpto;
+  private int curEndOffset;
+  private int curPayloadByteUpto;
+  private boolean fieldHasPositions;
+  private boolean fieldHasOffsets;
+  private boolean fieldHasPayloads;
+
+  public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
+    super(skipInterval, maxSkipLevels, docCount);
+    this.docOut = docOut;
+    this.posOut = posOut;
+    this.payOut = payOut;
+
+    lastSkipDoc = new int[maxSkipLevels];
+    lastSkipDocPointer = new long[maxSkipLevels];
+    if (posOut != null) {
+      lastSkipPosPointer = new long[maxSkipLevels];
+      if (payOut != null) {
+        lastSkipPayPointer = new long[maxSkipLevels];
+      }
+      lastEndOffset = new int[maxSkipLevels];
+      lastPayloadByteUpto = new int[maxSkipLevels];
+    }
+  }
+
+  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
+    this.fieldHasPositions = fieldHasPositions;
+    this.fieldHasOffsets = fieldHasOffsets;
+    this.fieldHasPayloads = fieldHasPayloads;
+  }
+
+  @Override
+  public void resetSkip() {
+    super.resetSkip();
+    Arrays.fill(lastSkipDoc, 0);
+    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
+    if (fieldHasPositions) {
+      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
+      if (fieldHasOffsets) {
+        Arrays.fill(lastEndOffset, 0);
+      }
+      if (fieldHasPayloads) {
+        Arrays.fill(lastPayloadByteUpto, 0);
+      }
+      if (fieldHasOffsets || fieldHasPayloads) {
+        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
+      }
+    }
+  }
+
+  /**
+   * Sets the values for the current skip data.
+ */ + public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException { + this.curDoc = doc; + this.curDocPointer = docOut.getFilePointer(); + this.curPosPointer = posFP; + this.curPayPointer = payFP; + this.curPosBufferUpto = posBufferUpto; + this.curPayloadByteUpto = payloadByteUpto; + this.curEndOffset = endOffset; + bufferSkip(numDocs); + } + + @Override + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + int delta = curDoc - lastSkipDoc[level]; + if (DEBUG) { + System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer); + } + skipBuffer.writeVInt(delta); + lastSkipDoc[level] = curDoc; + + skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level])); + lastSkipDocPointer[level] = curDocPointer; + + if (fieldHasPositions) { + if (DEBUG) { + System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto); + } + skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level])); + lastSkipPosPointer[level] = curPosPointer; + skipBuffer.writeVInt(curPosBufferUpto); + + if (fieldHasPayloads) { + skipBuffer.writeVInt(curPayloadByteUpto); + } + + if (fieldHasOffsets) { + skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]); + lastEndOffset[level] = curEndOffset; + } + + if (fieldHasOffsets || fieldHasPayloads) { + skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level])); + lastSkipPayPointer[level] = curPayPointer; + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java index 91c135c0d6a..750cf3a5280 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java @@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { int lastDocID; int df; - /** Adds a new doc in this term. If this returns null - * then we just skip consuming positions/payloads. */ @Override public void startDoc(int docID, int termDocFreq) throws IOException { // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java index 0f26bc66f1a..b660c0b38e6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/pfor/ForUtil.java @@ -17,8 +17,6 @@ package org.apache.lucene.codecs.pfor; */ import java.nio.IntBuffer; -import java.nio.ByteBuffer; -import java.util.Arrays; /** * Encode all values in normal area with fixed bit width, @@ -73,6 +71,9 @@ public class ForUtil { // since this buffer is reused at upper level, rewind first intBuffer.rewind(); + // nocommit assert header isn't "malformed", ie besides + // numBytes / bit-width there is nothing else! 
+ int numBits = ((header >> 8) & MASK[6]); decompressCore(intBuffer, data, numBits); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index f0ff871b64f..3dc1b0fc8c5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions; public class FieldInfos implements Iterable { private final boolean hasFreq; private final boolean hasProx; + private final boolean hasPayloads; + private final boolean hasOffsets; private final boolean hasVectors; private final boolean hasNorms; private final boolean hasDocValues; @@ -45,6 +47,8 @@ public class FieldInfos implements Iterable { public FieldInfos(FieldInfo[] infos) { boolean hasVectors = false; boolean hasProx = false; + boolean hasPayloads = false; + boolean hasOffsets = false; boolean hasFreq = false; boolean hasNorms = false; boolean hasDocValues = false; @@ -58,12 +62,16 @@ public class FieldInfos implements Iterable { hasVectors |= info.hasVectors(); hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY; + hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; hasNorms |= info.hasNorms(); hasDocValues |= info.hasDocValues(); + hasPayloads |= info.hasPayloads(); } this.hasVectors = hasVectors; this.hasProx = hasProx; + this.hasPayloads = hasPayloads; + this.hasOffsets = hasOffsets; this.hasFreq = hasFreq; this.hasNorms = hasNorms; this.hasDocValues = hasDocValues; @@ -79,6 +87,16 @@ public class FieldInfos implements Iterable { public boolean hasProx() { return hasProx; } + + /** Returns true if any fields have payloads */ + public boolean hasPayloads() { + return hasPayloads; + } + + /** Returns true if any fields have offsets */ + public boolean hasOffsets() { + return hasOffsets; + } /** * @return true if at least one field has any vectors diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index af76d41c42b..dc5843940cc 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.MemoryPostingsFormat org.apache.lucene.codecs.pfor.ForPostingsFormat org.apache.lucene.codecs.pfor.PForPostingsFormat org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat +org.apache.lucene.codecs.block.BlockPostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java new file mode 100644 index 00000000000..746e1e979e7 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java @@ -0,0 +1,866 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsConsumer; +import org.apache.lucene.codecs.TermStats; +import org.apache.lucene.codecs.TermsConsumer; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.BeforeClass; + +/* NOTE: This test focuses on the postings + * (docs/freqs/positions/payloads/offsets) impl, not the + * terms dict. The [stretch] goal is for this test to be + * so thorough in testing a new PostingsFormat that if this + * test passes, then all Lucene/Solr tests should also pass. Ie, + * if there is some bug in a given PostingsFormat that this + * test fails to catch then this test needs to be improved! */ + +// nocommit can we make it easy for testing to pair up a "random terms dict impl" with your postings base format... + +// nocommit test when you reuse after skipping a term or two, eg the block reuse case + +// nocommit hmm contract says .doc() can return NO_MORE_DOCS +// before nextDoc too...? + +/* TODO + - threads + - assert doc=-1 before any nextDoc + - if a PF passes this test but fails other tests then this + test has a bug!! 
+ - test tricky reuse cases, eg across fields + - verify you get null if you pass needFreq/needOffset but + they weren't indexed +*/ + +public class TestPostingsFormat extends LuceneTestCase { + + private enum Option { + // Sometimes use .advance(): + SKIPPING, + + // Sometimes reuse the Docs/AndPositionsEnum across terms: + REUSE_ENUMS, + + // Sometimes pass non-null live docs: + LIVE_DOCS, + + // Sometimes seek to term using previously saved TermState: + TERM_STATE, + + // Sometimes don't fully consume docs from the enum + PARTIAL_DOC_CONSUME, + + // Sometimes don't fully consume positions at each doc + PARTIAL_POS_CONSUME, + + // Sometimes check payloads + PAYLOADS, + + // Test w/ multiple threads + THREADS}; + + private static class FieldAndTerm { + String field; + BytesRef term; + + public FieldAndTerm(String field, BytesRef term) { + this.field = field; + this.term = BytesRef.deepCopyOf(term); + } + } + + private static class Position { + int position; + byte[] payload; + int startOffset; + int endOffset; + } + + private static class Posting implements Comparable{ + int docID; + List positions; + + public int compareTo(Posting other) { + return docID - other.docID; + } + } + + // Holds all postings: + private static Map>> fields = new TreeMap>>(); + + // Holds only live doc postings: + private static Map>> fieldsLive = new TreeMap>>(); + + private static FieldInfos fieldInfos; + + private static int maxDocID; + + private static FixedBitSet globalLiveDocs; + + private static List allTerms; + + @BeforeClass + public static void createPostings() throws IOException { + + final int numFields = _TestUtil.nextInt(random(), 1, 5); + if (VERBOSE) { + System.out.println("TEST: " + numFields + " fields"); + } + + FieldInfo[] fieldInfoArray = new FieldInfo[numFields]; + int fieldUpto = 0; + int numMediumTerms = 0; + int numBigTerms = 0; + int numManyPositions = 0; + while (fieldUpto < numFields) { + String field = _TestUtil.randomSimpleString(random()); + if (fields.containsKey(field)) { + continue; + } + + boolean fieldHasPayloads = random().nextBoolean(); + + fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads, + IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, + null, DocValues.Type.FIXED_INTS_8, null); + fieldUpto++; + + Map> postings = new TreeMap>(); + fields.put(field, postings); + Set seenTerms = new HashSet(); + + // nocommit + //final int numTerms = atLeast(10); + final int numTerms = 4; + for(int termUpto=0;termUpto termPostings = new ArrayList(); + postings.put(new BytesRef(term), termPostings); + + int docID = 0; + + // TODO: more realistic to inversely tie this to numDocs: + int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100); + + // 10% of the time create big payloads: + int payloadSize; + if (!fieldHasPayloads) { + payloadSize = 0; + } else if (random().nextInt(10) == 7) { + payloadSize = random().nextInt(50); + } else { + payloadSize = random().nextInt(10); + } + + boolean fixedPayloads = random().nextBoolean(); + + for(int docUpto=0;docUpto(); + termPostings.add(posting); + + int freq; + if (random().nextInt(30) == 17 && numManyPositions < 10) { + freq = _TestUtil.nextInt(random(), 1, 1000); + numManyPositions++; + } else { + freq = _TestUtil.nextInt(random(), 1, 20); + } + int pos = 0; + int offset = 0; + int posSpacing = _TestUtil.nextInt(random(), 1, 100); + for(int posUpto=0;posUpto>> fieldEnt : fields.entrySet()) { + Map> postingsLive = new TreeMap>(); + fieldsLive.put(fieldEnt.getKey(), postingsLive); + for(Map.Entry> 
termEnt : fieldEnt.getValue().entrySet()) { + List termPostingsLive = new ArrayList(); + postingsLive.put(termEnt.getKey(), termPostingsLive); + for(Posting posting : termEnt.getValue()) { + if (globalLiveDocs.get(posting.docID)) { + termPostingsLive.add(posting); + } + } + } + } + + allTerms = new ArrayList(); + for(Map.Entry>> fieldEnt : fields.entrySet()) { + String field = fieldEnt.getKey(); + for(Map.Entry> termEnt : fieldEnt.getValue().entrySet()) { + allTerms.add(new FieldAndTerm(field, termEnt.getKey())); + } + } + + if (VERBOSE) { + System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields"); + } + } + + // nocommit maybe instead of @BeforeClass just make a single test run: build postings & index & test it? + + private FieldInfos currentFieldInfos; + + // maxAllowed = the "highest" we can index, but we will still + // randomly index at lower IndexOption + private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException { + SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null); + + int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed); + if (VERBOSE) { + System.out.println("\nTEST: now build index"); + } + + // nocommit use allowPayloads + + FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()]; + for(int fieldUpto=0;fieldUpto= 0 && allowPayloads; + + newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name, + true, + fieldUpto, + false, + false, + doPayloads, + indexOptions, + null, + DocValues.Type.FIXED_INTS_8, + null); + } + + FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); + + SegmentWriteState writeState = new SegmentWriteState(null, dir, + segmentInfo, newFieldInfos, + 32, null, IOContext.DEFAULT); + + FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState); + + for(Map.Entry>> fieldEnt : fields.entrySet()) { + String field = fieldEnt.getKey(); + Map> terms = fieldEnt.getValue(); + + FieldInfo fieldInfo = newFieldInfos.fieldInfo(field); + if (VERBOSE) { + System.out.println("field=" + field); + } + + IndexOptions indexOptions = fieldInfo.getIndexOptions(); + + boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads; + + TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo); + long sumTotalTF = 0; + long sumDF = 0; + FixedBitSet seenDocs = new FixedBitSet(maxDocID+1); + for(Map.Entry> termEnt : terms.entrySet()) { + BytesRef term = termEnt.getKey(); + List postings = termEnt.getValue(); + if (VERBOSE) { + System.out.println(" term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size()); + } + + PostingsConsumer postingsConsumer = termsConsumer.startTerm(term); + long totalTF = 0; + int docCount = 0; + for(Posting posting : postings) { + if (VERBOSE) { + System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size()); + } + postingsConsumer.startDoc(posting.docID, posting.positions.size()); + seenDocs.set(posting.docID); + if (doPos) { + totalTF += posting.positions.size(); + for(Position pos : posting.positions) { + if (VERBOSE) { + if (doPayloads) { + System.out.println(" pos=" + 
pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes")); + } else { + System.out.println(" pos=" + pos.position); + } + } + postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, pos.startOffset, pos.endOffset); + } + } else if (doFreq) { + totalTF += posting.positions.size(); + } else { + totalTF++; + } + docCount++; + } + termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF)); + sumTotalTF += totalTF; + sumDF += postings.size(); + } + + termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality()); + } + + fieldsConsumer.close(); + + if (VERBOSE) { + System.out.println("TEST: after indexing: files="); + for(String file : dir.listAll()) { + System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes"); + } + } + + currentFieldInfos = newFieldInfos; + + SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1); + + return Codec.getDefault().postingsFormat().fieldsProducer(readState); + } + + private static class ThreadState { + // Only used with REUSE option: + public DocsEnum reuseDocsEnum; + public DocsAndPositionsEnum reuseDocsAndPositionsEnum; + } + + private void verifyEnum(ThreadState threadState, + String field, + BytesRef term, + TermsEnum termsEnum, + + // Maximum options (docs/freqs/positions/offsets) to test: + IndexOptions maxIndexOptions, + + EnumSet