From 573f6a734c734dca87ea770806e61c1168db8fb0 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Sat, 18 Aug 2012 14:42:59 +0000
Subject: [PATCH] LUCENE-3892: javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1374591 13f79535-47bb-0310-9956-ffa450edef68
---
.../codecs/block/BlockPostingsFormat.java | 88 +++++++++++--------
.../codecs/block/BlockPostingsWriter.java | 3 +
2 files changed, 55 insertions(+), 36 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
index c3ae82d65a3..c6a08d798c7 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
@@ -1,5 +1,6 @@
package org.apache.lucene.codecs.block;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -50,7 +51,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* -
* Packed Block and VInt Block:
- *
In packed block, integers are encoded with the same bit width ({@link PackedInts packed format}),
+ *
In packed block, integers are encoded with the same bit width ({@link PackedInts packed format}),
* the block size (i.e. number of integers inside block) is fixed.
* In VInt block, integers are encoded as {@link DataOutput#writeVInt VInt},
* the block size is variable.
@@ -80,7 +81,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* -
* Positions, Payloads, and Offsets:
- *
A position is an integer indicating where the term occured in one document.
+ *
A position is an integer indicating where the term occurs within one document.
* A payload is a blob of metadata associated with current position.
* An offset is a pair of integers indicating the tokenized start/end offsets for given term
* in current position.
@@ -91,6 +92,9 @@ import org.apache.lucene.util.packed.PackedInts;
* position data is separated out as .pos, while payloads and offsets are encoded in .pay (payload
* metadata will also be stored directly in .pay). When encoded as VInt block, all these three are
* stored in .pos (so as payload metadata).
+ * With this strategy, the majority of payload and offset data will be outside the .pos file.
+ * So for queries that require only position data, running on a full index with payloads and offsets,
+ * this reduces disk pre-fetches.
*
*
*
@@ -119,18 +123,18 @@ import org.apache.lucene.util.packed.PackedInts;
* TermDictionary(.tim) --> Header, DirOffset, PostingsHeader, PackedBlockSize,
* <Block>NumBlocks, FieldSummary
* Block --> SuffixBlock, StatsBlock, MetadataBlock
- * SuffixBlock --> EntryCount, SuffixLength, ByteSuffixLength
+ * SuffixBlock --> EntryCount, SuffixLength, {@link DataOutput#writeByte byte}SuffixLength
* StatsBlock --> StatsLength, <DocFreq, TotalTermFreq>EntryCount
* MetadataBlock --> MetaLength, <DocFPDelta,
- * <PosFPDelta, PosBlockFPDelta?, PayFPDelta?>?,
- * SkipFPDelta? >EntryCount
+ * <PosFPDelta, PosVIntBlockFPDelta?, PayFPDelta?>?,
+ * SkipFPDelta?>EntryCount
* FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength,
* ByteRootCodeLength, SumDocFreq, DocCount>
* NumFields
* Header, PostingsHeader --> {@link CodecUtil#writeHeader CodecHeader}
* DirOffset --> {@link DataOutput#writeLong Uint64}
* PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength,
- * PosBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
+ * PosVIntBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
* {@link DataOutput#writeVInt VInt}
* TotalTermFreq, DocFPDelta, PosFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq -->
* {@link DataOutput#writeVLong VLong}
@@ -141,27 +145,30 @@ import org.apache.lucene.util.packed.PackedInts;
*
* Lucene40PostingsFormat:TermDictionary
*
- * PackedBlockSize is fixed block size for packed blocks. In packed block, bit width is
+ * PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width is
* determined by the largest integer. Smaller block size result in smaller variance among width
* of integers hence smaller indexes. Larger block size result in more efficient bulk i/o hence
* better acceleration. This value should always be a multiple of 64, currently fixed as 128 as
- * a tradeoff. It is also the skip interval used to accerlerate {@link DocsEnum#advance(int)}.
+ * a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.
* DocFPDelta determines the position of this term's TermFreqs within the .doc file.
* In particular, it is the difference of file offset between this term's
- * data and previous term's data (or zero, for the first term in the block).
- * PayFPDelta determines the position of this term's payload or offset data within the .pay file.
- * Similar to DocFPDelta, it is the difference between two file positions (or neglected,
- * for fields that omit payloads and offsets, or for the first term in the block).
- *
- * PosFPDelta and PosBlockFPDelta determine the position of this term's TermPositions within
- * the .pos file.
- * PosBlockFPDelta determines the position of this term's TermPositions within the .pos file.
- * Similar to DocFPDelta, it is the difference between two file positions (or neglected,
- * for fields that omit position data, or for the first term in the block).
+ * data and previous term's data (or zero, for the first term in the block). On disk it is
+ * stored as the difference from previous value in sequence.
+ * PosFPDelta determines the position of this term's TermPositions within the .pos file,
+ * while PayFPDelta determines the position of this term's <TermPayloads, TermOffsets?> within
+ * the .pay file. Similar to DocFPDelta, it is the difference between two file positions (or
+ * neglected, for fields that omit payloads and offsets).
+ * PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last pos packed
+ * block within the .pos file. It is a synonym for PayVIntBlockFPDelta or OffsetVIntBlockFPDelta.
+ * This is actually used to indicate whether it is necessary to load the following
+ * payloads and offsets from .pos instead of .pay. Every time a new block of positions is to be
+ * loaded, the PostingsReader will use this value to check whether the current block is in packed format
+ * or VInt. When in packed format, payloads and offsets are fetched from .pay, otherwise from .pos.
+ * (This value is neglected when the total number of positions, i.e. totalTermFreq, is less than or equal
+ * to PackedBlockSize.)
* SkipFPDelta determines the position of this term's SkipData within the .doc
- * file. In particular, it is the number of bytes after TermFreqs that the
- * SkipData starts. In other words, it is the length of the TermFreq data.
- * SkipDelta is only stored if DocFreq is not smaller than SkipMinimum,
+ * file. In particular, it is the length of the TermFreq data.
+ * SkipFPDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in BlockPostingsFormat).
*
*
@@ -189,19 +196,20 @@ import org.apache.lucene.util.packed.PackedInts;
* each packed or VInt block, when the length of document list is larger than packed block size.
*
*
- * - docFile(.doc) --> Header, < TermFreqs, SkipData? >TermCount
+ * - docFile(.doc) --> Header, <TermFreqs, SkipData?>TermCount
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
- * - TermFreqs --> < PackedBlock > PackedDocBlockNum,
+ *
- TermFreqs --> <PackedBlock> PackedDocBlockNum,
* VIntBlock?
* - PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
- *
- VIntBlock --> < DocDelta[, Freq?] >DocFreq-PackedBlockSize*PackedDocBlockNum
+ *
- VIntBlock --> <DocDelta[, Freq?]>DocFreq-PackedBlockSize*PackedDocBlockNum
*
- SkipData --> <<SkipLevelLength, SkipLevel>
* NumSkipLevels-1, SkipLevel> <SkipDatum?>
* - SkipLevel --> <SkipDatum> TrimmedDocFreq/(PackedBlockSize^(Level + 1))
- * - SkipDatum --> DocSkip, DocFPSkip, < PosFPSkip, PosBlockOffset, PayLength?,
- * OffsetStart?, PayFPSkip? >?, SkipChildLevelPointer?
+ * - SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
+ * OffsetStart?, PayFPSkip?>?, SkipChildLevelPointer?
* - PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}
- * - DocDelta,Freq,DocSkip,DocFPSkip,PosFPSkip,PosBlockOffset,PayLength,OffsetStart,PayFPSkip -->
+ *
- DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayLength, OffsetStart, PayFPSkip
+ * -->
* {@link DataOutput#writeVInt VInt}
* - SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}
*
@@ -234,7 +242,7 @@ import org.apache.lucene.util.packed.PackedInts;
* SkipDatum is the metadata of one skip entry.
* For the first block (no matter packed or VInt), it is omitted.
* DocSkip records the document number of every PackedBlockSizeth document number in
- * the postings(i.e. last document number in each packed block). On disk it is stored as the
+ * the postings (i.e. last document number in each packed block). On disk it is stored as the
* difference from previous value in the sequence.
* DocFPSkip records the file offsets of each block (excluding )posting at
* PackedBlockSize+1th, 2*PackedBlockSize+1th ... , in DocFile.
@@ -256,13 +264,15 @@ import org.apache.lucene.util.packed.PackedInts;
*
* -
* Positions
+ *
The .pos file contains the lists of positions that each term occurs at within documents. It also
+ * sometimes stores part of payloads and offsets for speedup.
*
- * - Pos(.prx) --> Header, <TermPositions> TermCount
+ * - Pos(.pos) --> Header, <TermPositions> TermCount
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
- * - TermPositions --> < PackedPosDeltaBlock > PackedPosBlockNum,
+ *
- TermPositions --> <PackedPosDeltaBlock> PackedPosBlockNum,
* VIntBlock?
- * - VIntBlock --> PosVIntCount < PosDelta[, PayLength?], PayData?,
- * OffsetStartDelta?, OffsetLength? >PosVIntCount
+ *
- VIntBlock --> PosVIntCount <PosDelta[, PayLength?], PayData?,
+ * OffsetStartDelta?, OffsetLength?>PosVIntCount
*
- PackedPosDeltaBlock --> {@link PackedInts PackedInts}
* - PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength -->
* {@link DataOutput#writeVInt VInt}
@@ -272,6 +282,8 @@ import org.apache.lucene.util.packed.PackedInts;
*
* - TermPositions are order by term (terms are implicit, from the term dictionary), and position
* values for each term document pair are incremental, and ordered by document number.
+ * - PackedPosBlockNum is the number of packed blocks for current term's positions, payloads or offsets.
+ * In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize)
* - The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock
* in chapter Frequencies and Skip Data.
* - PosDelta is the same as the format mentioned in
@@ -290,10 +302,12 @@ import org.apache.lucene.util.packed.PackedInts;
*
* -
* Payloads and Offsets
+ *
The .pay file will store payloads and offsets associated with certain term-document positions.
+ * Some payloads and offsets will be separated out into the .pos file, for performance reasons.
*
* - PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> TermCount
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
- * - TermPayloads --> < PackedPayLengthBlock, PayBlockLength, PayData, PackedOffsetStartDeltaBlock?, PackedOffsetLengthBlock > PackedPayBlockNum
+ *
- TermPayloads --> <PackedPayLengthBlock, PayBlockLength, PayData, PackedOffsetStartDeltaBlock?, PackedOffsetLengthBlock?> PackedPayBlockNum
*
- PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}
* - PayBlockLength --> {@link DataOutput#writeVInt VInt}
* - PayData --> {@link DataOutput#writeByte byte}PayBlockLength
@@ -302,11 +316,13 @@ import org.apache.lucene.util.packed.PackedInts;
*
* - The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of
* payload/offsets are stored in .pos.
- * - The procedure how PackedPayLengthBlock is generated is the same as PackedFreqBlock
- * in chapter Frequencies and Skip Data.
+ * - The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the
+ * same as PackedFreqBlock in chapter Frequencies and Skip Data.
+ * PackedOffsetStartDeltaBlock, in contrast, follows the same procedure as PackedDocDeltaBlock.
* - PayBlockLength is the total length of payloads written within one block, should be the sum
* of PayLengths in one packed block.
- * - PayLength is the length of each payload, associated with current position.
+ * - PayLength in PackedPayLengthBlock is the length of each payload, associated with current
+ * position.
*
*
*
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
index e07b411139b..183fd5e7ea6 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
@@ -403,8 +403,11 @@ final class BlockPostingsWriter extends PostingsWriterBase {
// }
// }
+ // totalTermFreq is just the total number of positions (or payloads, or offsets)
+ // associated with the current term.
assert stats.totalTermFreq != -1;
if (stats.totalTermFreq > BLOCK_SIZE) {
+ // record file offset for last pos in last block
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
} else {
lastPosBlockOffset = -1;