diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java index 4822eb7249b..0c80d702c78 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java @@ -26,6 +26,8 @@ import org.apache.lucene.util.MathUtil; /** * This abstract class writes skip lists with multiple levels. * + *
+ * * Example for skipInterval = 3: * c (skip level 2) * c c c (skip level 1) @@ -45,6 +47,7 @@ import org.apache.lucene.util.MathUtil; * * While this class takes care of writing the different skip levels, * subclasses must define the actual format of the skip data. + ** @lucene.experimental */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java index afd04fff911..c3ae82d65a3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java @@ -30,10 +30,290 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; +// javadocs +import org.apache.lucene.codecs.MultiLevelSkipListWriter; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.packed.PackedInts; + /** - * Encodes/decode postings in packed int blocks for faster - * decode. + * Block postings format, which encodes postings in packed int blocks + * for faster decode. + * + *
+ * Basic idea: + *
In a packed block, integers are encoded with the same bit width ({@link PackedInts packed format}), + * and the block size (i.e. number of integers inside the block) is fixed.
+ *In a VInt block, integers are encoded as {@link DataOutput#writeVInt VInt}, + * and the block size is variable.
+ *When the postings are long enough, BlockPostingsFormat will try to encode most integer data + * as packed blocks.
+ *Take a term with 259 documents as an example: the first 256 document ids are encoded as two packed + * blocks, while the remaining 3 are encoded as one VInt block.
+ *Different kinds of data are always encoded separately into different packed blocks, but may + * possibly be encoded into the same VInt block.
+ *This strategy is applied to pairs: + * <document number, frequency>, + * <position, payload length>, + * <position, offset start, offset length>, and + * <position, payload length, offset start, offset length>.
+ *The structure of the skip table is quite similar to that of Lucene40PostingsFormat. The skip interval is the + * same as the block size, and each skip entry points to the beginning of each block. However, for + * the first block, skip data is omitted.
+ *A position is an integer indicating where the term occurred in one document. + * A payload is a blob of metadata associated with the current position. + * An offset is a pair of integers indicating the tokenized start/end offsets for the given term + * in the current position.
+ *When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a + * null payload contributes one count). As mentioned in block structure, it is possible to encode + * these three either together or separately. + *
In all cases, payloads and offsets are stored together. When encoded as a packed block, + * position data is separated out as .pos, while payloads and offsets are encoded in .pay (payload + * metadata will also be stored directly in .pay). When encoded as a VInt block, all three are + * stored in .pos (as is the payload metadata).
+ *+ * Files and detailed format: + *
The .tim file format is quite similar to Lucene40PostingsFormat, + * with minor differences in MetadataBlock
+ * + *Notes:
+ *The .tim file format is mentioned in + * + * Lucene40PostingsFormat:TermIndex + *
The .doc file contains the lists of documents which contain each term, along + * with the frequency of the term in that document (except when frequencies are + * omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of + * each packed or VInt block, when the length of the document list is larger than the packed block size.
+ * + *Notes:
+ *Notes:
+ *Notes:
+ *