LUCENE-3892: merge BlockPostingsFormat from branch

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1375486 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-08-21 12:00:48 +00:00
commit 03003209b4
33 changed files with 35430 additions and 8018 deletions

File: CHANGES.txt

@@ -15,6 +15,11 @@ New Features
   that you must store term vector positions to store payloads.
   (Robert Muir)

+* LUCENE-3892: Add a new BlockPostingsFormat that bulk-encodes docs,
+  freqs and positions in large (size 128) packed-int blocks for faster
+  search performance. This was from Han Jiang's 2012 Google Summer of
+  Code project (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)

 API Changes

 * LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().

File: BlockTreeTermsWriter.java

@@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
         // Write term stats, to separate byte[] blob:
         bytesWriter2.writeVInt(term.stats.docFreq);
         if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-          assert term.stats.totalTermFreq >= term.stats.docFreq;
+          assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
           bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
         }
       }

File: MultiLevelSkipListReader.java

@@ -59,30 +59,36 @@ public abstract class MultiLevelSkipListReader {
   private int skipInterval[];     // skipInterval of each level
   private int[] numSkipped;       // number of docs skipped per level
-  private int[] skipDoc;          // doc id of current skip entry per level
+  protected int[] skipDoc;        // doc id of current skip entry per level
   private int lastDoc;            // doc id of last read skip entry with docId <= target
   private long[] childPointer;    // child pointer of current skip entry per level
   private long lastChildPointer;  // childPointer of last read skip entry with docId <= target

   private boolean inputIsBuffered;
+  private final int skipMultiplier;

-  public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
+  protected MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, int skipMultiplier) {
     this.skipStream = new IndexInput[maxSkipLevels];
     this.skipPointer = new long[maxSkipLevels];
     this.childPointer = new long[maxSkipLevels];
     this.numSkipped = new int[maxSkipLevels];
     this.maxNumberOfSkipLevels = maxSkipLevels;
     this.skipInterval = new int[maxSkipLevels];
+    this.skipMultiplier = skipMultiplier;
     this.skipStream [0]= skipStream;
     this.inputIsBuffered = (skipStream instanceof BufferedIndexInput);
     this.skipInterval[0] = skipInterval;
     for (int i = 1; i < maxSkipLevels; i++) {
       // cache skip intervals
-      this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval;
+      this.skipInterval[i] = this.skipInterval[i - 1] * skipMultiplier;
     }
     skipDoc = new int[maxSkipLevels];
   }

+  // skipMultiplier and skipInterval are the same:
+  protected MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
+    this(skipStream, maxSkipLevels, skipInterval, skipInterval);
+  }

   /** Returns the id of the doc to which the last call of {@link #skipTo(int)}
    * has skipped. */

@@ -157,7 +163,7 @@ public abstract class MultiLevelSkipListReader {
       numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
       skipDoc[level] = lastDoc;
       if (level > 0) {
         childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
       }
     }

@@ -187,7 +193,12 @@ public abstract class MultiLevelSkipListReader {

   /** Loads the skip levels  */
   private void loadSkipLevels() throws IOException {
-    numberOfSkipLevels = MathUtil.log(docCount, skipInterval[0]);
+    if (docCount <= skipInterval[0]) {
+      numberOfSkipLevels = 1;
+    } else {
+      numberOfSkipLevels = 1+MathUtil.log(docCount/skipInterval[0], skipMultiplier);
+    }
     if (numberOfSkipLevels > maxNumberOfSkipLevels) {
       numberOfSkipLevels = maxNumberOfSkipLevels;
     }

File: MultiLevelSkipListWriter.java

@@ -26,6 +26,8 @@ import org.apache.lucene.util.MathUtil;
 /**
  * This abstract class writes skip lists with multiple levels.
  *
+ * <pre>
+ *
  * Example for skipInterval = 3:
  *                                     c               (skip level 2)
  *             c           c           c               (skip level 1)

@@ -45,6 +47,7 @@ import org.apache.lucene.util.MathUtil;
  *
  * While this class takes care of writing the different skip levels,
  * subclasses must define the actual format of the skip data.
+ * </pre>
  * @lucene.experimental
  */

@@ -55,14 +58,22 @@ public abstract class MultiLevelSkipListWriter {
   // the skip interval in the list with level = 0
   private int skipInterval;

+  // skipInterval used for level > 0
+  private int skipMultiplier;

   // for every skip level a different buffer is used
   private RAMOutputStream[] skipBuffer;

-  protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
+  protected MultiLevelSkipListWriter(int skipInterval, int skipMultiplier, int maxSkipLevels, int df) {
     this.skipInterval = skipInterval;
+    this.skipMultiplier = skipMultiplier;

     // calculate the maximum number of skip levels for this document frequency
-    numberOfSkipLevels = MathUtil.log(df, skipInterval);
+    if (df <= skipInterval) {
+      numberOfSkipLevels = 1;
+    } else {
+      numberOfSkipLevels = 1+MathUtil.log(df/skipInterval, skipMultiplier);
+    }

     // make sure it does not exceed maxSkipLevels
     if (numberOfSkipLevels > maxSkipLevels) {

@@ -70,6 +81,11 @@ public abstract class MultiLevelSkipListWriter {
     }
   }

+  // skipMultiplier and skipInterval are the same:
+  protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
+    this(skipInterval, skipInterval, maxSkipLevels, df);
+  }

   protected void init() {
     skipBuffer = new RAMOutputStream[numberOfSkipLevels];
     for (int i = 0; i < numberOfSkipLevels; i++) {

@@ -104,11 +120,15 @@ public abstract class MultiLevelSkipListWriter {
    * @throws IOException
    */
   public void bufferSkip(int df) throws IOException {
-    int numLevels;
+
+    assert df % skipInterval == 0;
+    int numLevels = 1;
+    df /= skipInterval;

     // determine max level
-    for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) {
+    while ((df % skipMultiplier) == 0 && numLevels < numberOfSkipLevels) {
       numLevels++;
+      df /= skipMultiplier;
     }

     long childPointer = 0;

@@ -150,5 +170,4 @@ public abstract class MultiLevelSkipListWriter {
     return skipPointer;
   }
 }
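A companion sketch (again illustrative only, using the same hypothetical 128/8 configuration) of what the reworked bufferSkip(df) level computation produces:

// numLevels as computed by the new bufferSkip(df): df must be a multiple of
// skipInterval, and each extra factor of skipMultiplier promotes the entry one level.
static int numLevels(int df, int skipInterval, int skipMultiplier, int numberOfSkipLevels) {
  assert df % skipInterval == 0;
  int numLevels = 1;
  df /= skipInterval;
  while ((df % skipMultiplier) == 0 && numLevels < numberOfSkipLevels) {
    numLevels++;
    df /= skipMultiplier;
  }
  return numLevels;
}

// With skipInterval = 128, skipMultiplier = 8 and plenty of levels available:
//   numLevels(128, ...)  == 1  (plain level-0 entry)
//   numLevels(1024, ...) == 2  (every 8th level-0 entry also goes to level 1)
//   numLevels(8192, ...) == 3  (every 64th level-0 entry also goes to level 2)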

File: BlockPostingsFormat.java (new file)

@@ -0,0 +1,418 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
* Block postings format, which encodes postings in packed int blocks
* for faster decode.
*
* <p>
* Basic idea:
* <ul>
* <li>
* <b>Packed Block and VInt Block</b>:
* <p>In packed block, integers are encoded with the same bit width ({@link PackedInts packed format}),
* the block size (i.e. number of integers inside block) is fixed. </p>
* <p>In VInt block, integers are encoded as {@link DataOutput#writeVInt VInt},
* the block size is variable.</p>
* </li>
*
* <li>
* <b>Block structure</b>:
* <p>When the postings is long enough, BlockPostingsFormat will try to encode most integer data
* as packed block.</p>
* <p>Take a term with 259 documents as example, the first 256 document ids are encoded as two packed
* blocks, while the remaining 3 as one VInt block. </p>
* <p>Different kinds of data are always encoded separately into different packed blocks, but may
* possible be encoded into a same VInt block. </p>
* <p>This strategy is applied to pairs:
* &lt;document number, frequency&gt;,
* &lt;position, payload length&gt;,
* &lt;position, offset start, offset length&gt;, and
 * &lt;position, payload length, offset start, offset length&gt;.</p>
* </li>
*
* <li>
* <b>Skipper setting</b>:
 * <p>The structure of the skip table is quite similar to that of Lucene40PostingsFormat. The skip interval is the
 * same as the block size, and each skip entry points to the beginning of each block. However, for
 * the first block, skip data is omitted.</p>
* </li>
*
* <li>
* <b>Positions, Payloads, and Offsets</b>:
 * <p>A position is an integer indicating where the term occurs within one document.
 * A payload is a blob of metadata associated with the current position.
 * An offset is a pair of integers indicating the tokenized start/end offsets for the given term
 * at the current position. </p>
 * <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a
 * null payload contributes one count). As mentioned in the block structure, it is possible to encode
 * these three either combined or separately.</p>
 * <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
 * position data is separated out into .pos, while payloads and offsets are encoded in .pay (payload
 * metadata will also be stored directly in .pay). When encoded as a VInt block, all three are
 * stored in .pos (as is the payload metadata).</p>
* <p>With this strategy, the majority of payload and offset data will be outside .pos file.
* So for queries that require only position data, running on a full index with payloads and offsets,
* this reduces disk pre-fetches.</p>
* </li>
* </ul>
* </p>
*
* <p>
* Files and detailed format:
* <ul>
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
* <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
* <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
* <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
* <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
* </ul>
* </p>
*
* <a name="Termdictionary" id="Termdictionary"></a>
* <dl>
* <dd>
* <b>Term Dictionary</b>
*
 * <p>The .tim file format is quite similar to that of Lucene40PostingsFormat,
 * with a minor difference in the MetadataBlock.</p>
*
* <ul>
* <!-- TODO: expand on this, its not really correct and doesnt explain sub-blocks etc -->
* <li>TermDictionary(.tim) --&gt; Header, DirOffset, PostingsHeader, PackedBlockSize,
* &lt;Block&gt;<sup>NumBlocks</sup>, FieldSummary</li>
* <li>Block --&gt; SuffixBlock, StatsBlock, MetadataBlock</li>
* <li>SuffixBlock --&gt; EntryCount, SuffixLength, {@link DataOutput#writeByte byte}<sup>SuffixLength</sup></li>
* <li>StatsBlock --&gt; StatsLength, &lt;DocFreq, TotalTermFreq&gt;<sup>EntryCount</sup></li>
* <li>MetadataBlock --&gt; MetaLength, &lt;DocFPDelta,
* &lt;PosFPDelta, PosVIntBlockFPDelta?, PayFPDelta?&gt;?,
* SkipFPDelta?&gt;<sup>EntryCount</sup></li>
* <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, RootCodeLength,
* {@link DataOutput#writeByte byte}<sup>RootCodeLength</sup>, SumDocFreq, DocCount&gt;
* <sup>NumFields</sup></li>
* <li>Header, PostingsHeader --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
* <li>PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength,
* PosVIntBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount --&gt;
* {@link DataOutput#writeVInt VInt}</li>
* <li>TotalTermFreq, DocFPDelta, PosFPDelta, PayFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq --&gt;
* {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
* <ul>
 * <li>Only the MetadataBlock is explained here; the other fields are described in
 * <a href="../lucene40/Lucene40PostingsFormat.html#Termdictionary">Lucene40PostingsFormat:TermDictionary</a>
 * </li>
 * <li>PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width is
 * determined by the largest integer. A smaller block size results in smaller variance among the widths
 * of integers, hence smaller indexes. A larger block size results in more efficient bulk i/o, hence
 * better acceleration. This value should always be a multiple of 64, currently fixed as 128 as
 * a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.</li>
 * <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file.
 * In particular, it is the difference in file offset between this term's
 * data and the previous term's data (or zero, for the first term in the block). On disk it is
 * stored as the difference from the previous value in the sequence. </li>
 * <li>PosFPDelta determines the position of this term's TermPositions within the .pos file,
 * while PayFPDelta determines the position of this term's &lt;TermPayloads, TermOffsets?&gt; within
 * the .pay file. Similar to DocFPDelta, it is the difference between two file positions (or
 * omitted, for fields that omit payloads and offsets).</li>
 * <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last pos packed
 * block within the .pos file. It is a synonym for PayVIntBlockFPDelta or OffsetVIntBlockFPDelta.
 * It is actually used to indicate whether it is necessary to load the following
 * payloads and offsets from .pos instead of .pay. Every time a new block of positions is to be
 * loaded, the PostingsReader will use this value to check whether the current block is in packed format
 * or VInt. When in packed format, payloads and offsets are fetched from .pay, otherwise from .pos.
 * (This value is omitted when the total number of positions, i.e. totalTermFreq, is less than or equal
 * to PackedBlockSize.)</li>
* <li>SkipFPDelta determines the position of this term's SkipData within the .doc
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in BlockPostingsFormat).</li>
* </ul>
* </dd>
* </dl>
*
* <a name="Termindex" id="Termindex"></a>
* <dl>
* <dd>
* <b>Term Index</b>
 * <p>The .tip file format is mentioned in
* <a href="../lucene40/Lucene40PostingsFormat.html#Termindex">Lucene40PostingsFormat:TermIndex</a>
* </dd>
* </dl>
*
*
* <a name="Frequencies" id="Frequencies"></a>
* <dl>
* <dd>
* <b>Frequencies and Skip Data</b>
*
* <p>The .doc file contains the lists of documents which contain each term, along
* with the frequency of the term in that document (except when frequencies are
* omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of
 * each packed or VInt block, when the length of the document list is larger than the packed block size.</p>
*
* <ul>
* <li>docFile(.doc) --&gt; Header, &lt;TermFreqs, SkipData?&gt;<sup>TermCount</sup></li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermFreqs --&gt; &lt;PackedBlock&gt; <sup>PackedDocBlockNum</sup>,
* VIntBlock? </li>
* <li>PackedBlock --&gt; PackedDocDeltaBlock, PackedFreqBlock?
* <li>VIntBlock --&gt; &lt;DocDelta[, Freq?]&gt;<sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
* <li>SkipData --&gt; &lt;&lt;SkipLevelLength, SkipLevel&gt;
* <sup>NumSkipLevels-1</sup>, SkipLevel&gt;, SkipDatum?</li>
* <li>SkipLevel --&gt; &lt;SkipDatum&gt; <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li>
* <li>SkipDatum --&gt; DocSkip, DocFPSkip, &lt;PosFPSkip, PosBlockOffset, PayLength?,
* OffsetStart?, PayFPSkip?&gt;?, SkipChildLevelPointer?</li>
* <li>PackedDocDeltaBlock, PackedFreqBlock --&gt; {@link PackedInts PackedInts}</li>
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayLength, OffsetStart, PayFPSkip
* --&gt;
* {@link DataOutput#writeVInt VInt}</li>
* <li>SkipChildLevelPointer --&gt; {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
* <ul>
 * <li>PackedDocDeltaBlock is conceptually generated in two steps:
 * <ol>
 * <li>Calculate the difference between each document number and the previous one,
 * to get a d-gaps list (for the first document, use the absolute value); </li>
 * <li>For those d-gaps from the first one to the PackedDocBlockNum*PackedBlockSize<sup>th</sup>,
 * encode them separately as packed blocks.</li>
 * </ol>
 * If frequencies are not omitted, PackedFreqBlock will be generated without the d-gap step.
* </li>
* <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a format
* mentioned in
* <a href="../lucene40/Lucene40PostingsFormat.html#Frequencies">Lucene40PostingsFormat:Frequencies</a>
* </li>
 * <li>PackedDocBlockNum is the number of packed blocks for the current term's docids or frequencies.
 * In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) </li>
 * <li>TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq.
 * We use this trick since the definition of a skip entry is a little different from the base interface.
 * In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for the
 * skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
 * in BlockPostingsFormat, the skip data is saved for the skipInterval+1<sup>th</sup>,
 * 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
 * When DocFreq is a multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
 * more skip datum than BlockSkipWriter. </li>
* <li>SkipDatum is the metadata of one skip entry.
* For the first block (no matter packed or VInt), it is omitted.</li>
 * <li>DocSkip records every PackedBlockSize<sup>th</sup> document number in
 * the postings (i.e. the last document number in each packed block). On disk it is stored as the
 * difference from the previous value in the sequence. </li>
 * <li>DocFPSkip records the file offset of each block (excluding the first block), i.e. of the posting at
 * PackedBlockSize+1<sup>th</sup>, 2*PackedBlockSize+1<sup>th</sup> ... , in DocFile.
 * The file offsets are relative to the start of the current term's TermFreqs.
 * On disk it is also stored as the difference from the previous SkipDatum in the sequence.</li>
 * <li>Since positions and payloads are also block encoded, the skipper should skip to the related block first,
 * then fetch the values according to the in-block offset. PosFPSkip and PayFPSkip record the file
 * offsets of the related blocks in .pos and .pay, respectively, while PosBlockOffset indicates
 * which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always
 * equal to PosBlockOffset). As with DocFPSkip, the file offsets are relative to the start of the
 * current term's TermFreqs, and stored as a difference sequence.</li>
* <li>PayLength indicates the length of last payload.</li>
* <li>OffsetStart indicates the first value of last offset pair.</li>
* </ul>
* </dd>
* </dl>
*
* <a name="Positions" id="Positions"></a>
* <dl>
* <dd>
* <b>Positions</b>
* <p>The .pos file contains the lists of positions that each term occurs at within documents. It also
* sometimes stores part of payloads and offsets for speedup.</p>
* <ul>
* <li>PosFile(.pos) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup></li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPositions --&gt; &lt;PackedPosDeltaBlock&gt; <sup>PackedPosBlockNum</sup>,
* VIntBlock? </li>
* <li>VIntBlock --&gt; PosVIntCount, &lt;PosDelta[, PayLength?], PayData?,
* OffsetStartDelta?, OffsetLength?&gt;<sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}</li>
* <li>PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength --&gt;
* {@link DataOutput#writeVInt VInt}</li>
* <li>PayData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
* </ul>
* <p>Notes:</p>
* <ul>
 * <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
 * values for each term-document pair are incremental, and ordered by document number.</li>
 * <li>PackedPosBlockNum is the number of packed blocks for the current term's positions, payloads or offsets.
 * In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize) </li>
 * <li>PosVIntCount is the number of positions encoded in VInt format. In particular,
 * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize</li>
 * <li>The procedure by which PackedPosDeltaBlock is generated is the same as for PackedDocDeltaBlock
 * in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
* <li>PosDelta is the same as the format mentioned in
* <a href="../lucene40/Lucene40PostingsFormat.html#Positions">Lucene40PostingsFormat:Positions</a>
* </li>
* <li>OffsetStartDelta is the difference between this position's startOffset from the previous
* occurrence (or zero, if this is the first occurrence in this document).</li>
* <li>OffsetLength indicates the length of the current offset (endOffset-startOffset).</li>
* <li>PayloadData is the blob of metadata associated with current position.</li>
* </ul>
* </dd>
* </dl>
*
* <a name="Payloads" id="Payloads"></a>
* <dl>
* <dd>
* <b>Payloads and Offsets</b>
 * <p>The .pay file will store payloads and offsets associated with certain term-document positions.
 * Some payloads and offsets will be separated out into the .pos file, for performance reasons.</p>
* <ul>
* <li>PayFile(.pay): --&gt; Header, &lt;TermPayloads, TermOffsets?&gt; <sup>TermCount</sup></li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPayloads --&gt; &lt;PackedPayLengthBlock, SumPayLength, PayData&gt; <sup>PackedPayBlockNum</sup>
* <li>TermOffsets --&gt; &lt;PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock&gt; <sup>PackedPayBlockNum</sup>
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --&gt; {@link PackedInts PackedInts}</li>
* <li>SumPayLength --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>PayData --&gt; {@link DataOutput#writeByte byte}<sup>SumPayLength</sup></li>
* </ul>
* <p>Notes:</p>
* <ul>
 * <li>The order of TermPayloads/TermOffsets will be the same as TermPositions; note that part of the
 * payloads/offsets are stored in .pos.</li>
 * <li>The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the
 * same as for PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>,
 * while PackedStartDeltaBlock follows the same procedure as PackedDocDeltaBlock.</li>
 * <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also a synonym
 * for PackedOffsetBlockNum.</li>
 * <li>SumPayLength is the total length of payloads written within one block; it should be the sum
 * of the PayLengths in one packed block.</li>
* <li>PayLength in PackedPayLengthBlock is the length of each payload, associated with current
* position.</li>
* </ul>
* </dd>
* </dl>
* </p>
*
*/
public final class BlockPostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data.
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
*/
public static final String DOC_EXTENSION = "doc";
/**
* Filename extension for positions.
* See chapter: <a href="#Positions">Positions</a>
*/
public static final String POS_EXTENSION = "pos";
/**
* Filename extension for payloads and offsets.
* See chapter: <a href="#Payloads">Payloads and Offsets</a>
*/
public static final String PAY_EXTENSION = "pay";
private final int minTermBlockSize;
private final int maxTermBlockSize;
/**
* Fixed packed block size, number of integers encoded in
* a single packed block.
*/
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
public BlockPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Block");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
this.maxTermBlockSize = maxTermBlockSize;
assert minTermBlockSize <= maxTermBlockSize;
}
@Override
public String toString() {
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minTermBlockSize,
maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}
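To make the packed/VInt split described in the javadoc above concrete, here is a hedged sketch (not part of the commit) of how a term's document count maps to blocks when BLOCK_SIZE is 128:

// For the javadoc's example of a term with 259 documents: two full packed
// blocks cover the first 256 docs, and the remaining 3 land in the vInt tail.
static void illustrateBlockSplit(int docFreq) {
  final int blockSize = BlockPostingsFormat.BLOCK_SIZE;       // 128
  int packedDocBlockNum = docFreq / blockSize;                 // floor(259 / 128) = 2
  int vIntTailDocs = docFreq - packedDocBlockNum * blockSize;  // 259 - 256 = 3
  System.out.println(packedDocBlockNum + " packed blocks, " + vIntTailDocs + " docs in the vInt tail");
}

This is just the PackedDocBlockNum = floor(DocFreq/PackedBlockSize) relation from the Frequencies notes, restated as code.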

File diff suppressed because it is too large

File: BlockPostingsWriter.java (new file)

@@ -0,0 +1,563 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
 * Concrete class that writes the docId (and maybe frq, pos, offset, payloads) list
 * with this postings format.
 *
 * The postings list for each term is stored separately.
 *
 * @see BlockSkipWriter for details about skip settings and postings layout.
 *
 */
final class BlockPostingsWriter extends PostingsWriterBase {
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*/
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "BlockPostingsWriterTerms";
final static String DOC_CODEC = "BlockPostingsWriterDoc";
final static String POS_CODEC = "BlockPostingsWriterPos";
final static String PAY_CODEC = "BlockPostingsWriterPay";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final IndexOutput docOut;
final IndexOutput posOut;
final IndexOutput payOut;
private IndexOutput termsOut;
// How current field indexes postings:
private boolean fieldHasFreqs;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
// Holds starting file pointers for each term:
private long docTermStartFP;
private long posTermStartFP;
private long payTermStartFP;
final int[] docDeltaBuffer;
final int[] freqBuffer;
private int docBufferUpto;
final int[] posDeltaBuffer;
final int[] payloadLengthBuffer;
final int[] offsetStartDeltaBuffer;
final int[] offsetLengthBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int lastBlockDocID;
private long lastBlockPosFP;
private long lastBlockPayFP;
private int lastBlockPosBufferUpto;
private int lastBlockStartOffset;
private int lastBlockPayloadByteUpto;
private int lastDocID;
private int lastPosition;
private int lastStartOffset;
private int docCount;
final byte[] encoded;
private final ForUtil forUtil;
private final BlockSkipWriter skipWriter;
public BlockPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new int[MAX_DATA_SIZE];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
}
}
docDeltaBuffer = new int[MAX_DATA_SIZE];
freqBuffer = new int[MAX_DATA_SIZE];
// TODO: should we try skipping every 2/4 blocks...?
skipWriter = new BlockSkipWriter(maxSkipLevels,
BLOCK_SIZE,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[MAX_ENCODED_SIZE];
}
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
this(state, PackedInts.COMPACT);
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
termsOut.writeVInt(BLOCK_SIZE);
}
@Override
public void setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
fieldHasPayloads = fieldInfo.hasPayloads();
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
}
@Override
public void startTerm() {
docTermStartFP = docOut.getFilePointer();
if (fieldHasPositions) {
posTermStartFP = posOut.getFilePointer();
if (fieldHasPayloads || fieldHasOffsets) {
payTermStartFP = payOut.getFilePointer();
}
}
lastDocID = 0;
lastBlockDocID = -1;
// if (DEBUG) {
// System.out.println("FPW.startTerm startFP=" + docTermStartFP);
// }
skipWriter.resetSkip();
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
// if (DEBUG) {
// System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
// }
// Have collected a block of docs, and get a new doc.
// Should write skip data as well as postings list for
// current block.
if (lastBlockDocID != -1 && docBufferUpto == 0) {
// if (DEBUG) {
// System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
// }
skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
}
docDeltaBuffer[docBufferUpto] = docDelta;
// if (DEBUG) {
// System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
// }
if (fieldHasFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
docCount++;
if (docBufferUpto == BLOCK_SIZE) {
// if (DEBUG) {
// System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
// }
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
if (fieldHasFreqs) {
// if (DEBUG) {
// System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
// }
forUtil.writeBlock(freqBuffer, encoded, docOut);
}
// NOTE: don't set docBufferUpto back to 0 here;
// finishDoc will do so (because it needs to see that
// the block was filled so it can save skip data)
}
lastDocID = docID;
lastPosition = 0;
lastStartOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
// if (DEBUG) {
// System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
// }
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (fieldHasPayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (fieldHasOffsets) {
assert startOffset >= lastStartOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastStartOffset = startOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == BLOCK_SIZE) {
// if (DEBUG) {
// System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
// }
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
if (fieldHasPayloads) {
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (fieldHasOffsets) {
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() throws IOException {
    // Since we don't know df for the current term, we have to buffer
    // the skip data for each block, and when a new doc comes,
    // write it to the skip file.
if (docBufferUpto == BLOCK_SIZE) {
lastBlockDocID = lastDocID;
if (posOut != null) {
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosFP = posOut.getFilePointer();
lastBlockPosBufferUpto = posBufferUpto;
lastBlockStartOffset = lastStartOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
}
// if (DEBUG) {
// System.out.println(" docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
// }
docBufferUpto = 0;
}
}
private static class PendingTerm {
public final long docStartFP;
public final long posStartFP;
public final long payStartFP;
public final int skipOffset;
public final int lastPosBlockOffset;
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
}
}
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
// if (DEBUG) {
// System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
// }
// if (DEBUG) {
// if (docBufferUpto > 0) {
// System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
// }
// }
// vInt encode the remaining doc deltas and freqs:
for(int i=0;i<docBufferUpto;i++) {
final int docDelta = docDeltaBuffer[i];
final int freq = freqBuffer[i];
if (!fieldHasFreqs) {
docOut.writeVInt(docDelta);
} else if (freqBuffer[i] == 1) {
docOut.writeVInt((docDelta<<1)|1);
} else {
docOut.writeVInt(docDelta<<1);
docOut.writeVInt(freq);
}
}
final int lastPosBlockOffset;
if (fieldHasPositions) {
// if (DEBUG) {
// if (posBufferUpto > 0) {
// System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
// }
// }
// totalTermFreq is just total number of positions(or payloads, or offsets)
// associated with current term.
assert stats.totalTermFreq != -1;
if (stats.totalTermFreq > BLOCK_SIZE) {
// record file offset for last pos in last block
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
posOut.writeVInt(posBufferUpto);
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1;
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
if (fieldHasPayloads) {
final int payloadLength = payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta<<1)|1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta<<1);
}
// if (DEBUG) {
// System.out.println(" i=" + i + " payloadLen=" + payloadLength);
// }
if (payloadLength != 0) {
// if (DEBUG) {
// System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
// }
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (fieldHasOffsets) {
// if (DEBUG) {
// System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
// }
posOut.writeVInt(offsetStartDeltaBuffer[i]);
posOut.writeVInt(offsetLengthBuffer[i]);
}
}
if (fieldHasPayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
// if (DEBUG) {
// System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
// }
} else {
lastPosBlockOffset = -1;
}
int skipOffset;
if (docCount > BLOCK_SIZE) {
skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);
// if (DEBUG) {
// System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
// }
} else {
skipOffset = -1;
// if (DEBUG) {
// System.out.println(" no skip: docCount=" + docCount);
// }
}
long payStartFP;
if (stats.totalTermFreq >= BLOCK_SIZE) {
payStartFP = payTermStartFP;
} else {
payStartFP = -1;
}
// if (DEBUG) {
// System.out.println(" payStartFP=" + payStartFP);
// }
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
@Override
public void flushTermsBlock(int start, int count) throws IOException {
if (count == 0) {
termsOut.writeByte((byte) 0);
return;
}
assert start <= pendingTerms.size();
assert count <= start;
final int limit = pendingTerms.size() - start + count;
long lastDocStartFP = 0;
long lastPosStartFP = 0;
long lastPayStartFP = 0;
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
lastDocStartFP = term.docStartFP;
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
lastPosStartFP = term.posStartFP;
if (term.lastPosBlockOffset != -1) {
bytesWriter.writeVInt(term.lastPosBlockOffset);
}
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
lastPayStartFP = term.payStartFP;
}
}
if (term.skipOffset != -1) {
bytesWriter.writeVInt(term.skipOffset);
}
}
termsOut.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(termsOut);
bytesWriter.reset();
// Remove the terms we just wrote:
pendingTerms.subList(limit-count, limit).clear();
}
@Override
public void close() throws IOException {
IOUtils.close(docOut, posOut, payOut);
}
}
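For readers tracing the vInt tail written in finishTerm above, a small sketch (illustrative only, covering just the fieldHasFreqs case) of the doc-delta/frequency bit packing and its inverse:

// finishTerm packs a doc delta and its frequency into one or two vInts:
// the low bit of the first value says whether freq == 1 (and is omitted).
static int[] encodeDocAndFreq(int docDelta, int freq) {
  if (freq == 1) {
    return new int[] { (docDelta << 1) | 1 };   // single value, low bit set
  } else {
    return new int[] { docDelta << 1, freq };   // low bit clear, freq follows
  }
}

static int[] decodeDocAndFreq(int[] values) {
  int code = values[0];
  int docDelta = code >>> 1;
  int freq = (code & 1) != 0 ? 1 : values[1];
  return new int[] { docDelta, freq };
}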

File: BlockSkipReader.java (new file)

@@ -0,0 +1,244 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for block postings format
* that stores positions and payloads.
*
* Although this skipper uses MultiLevelSkipListReader as an interface,
* its definition of skip position will be a little different.
*
* For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6,
*
 * 0 1 2 3 4 5
 * d d d d d d    (posting list)
 *     ^     ^    (skip point in MultiLevelSkipListWriter)
 *       ^        (skip point in BlockSkipWriter)
 *
 * In this case, MultiLevelSkipListReader will use the last document as a skip point,
 * while BlockSkipReader should assume no skip point will come.
 *
 * If we use the interface directly in BlockSkipReader, it may naively try to read
 * more skip data after the only skip point is loaded.
 *
 * To illustrate this: if we call skipTo(d[5]), since skip point d[3] has a smaller docId
 * and numSkipped+blockSize == df, the MultiLevelSkipListReader will assume the skip list
 * isn't exhausted yet, and will try to load a non-existent skip point.
 *
 * Therefore, we'll trim df before passing it to the interface. See trim(int).
*
*/
final class BlockSkipReader extends MultiLevelSkipListReader {
// private boolean DEBUG = BlockPostingsReader.DEBUG;
private final int blockSize;
private long docPointer[];
private long posPointer[];
private long payPointer[];
private int posBufferUpto[];
private int startOffset[];
private int payloadByteUpto[];
private long lastPosPointer;
private long lastPayPointer;
private int lastStartOffset;
private int lastPayloadByteUpto;
private long lastDocPointer;
private int lastPosBufferUpto;
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, blockSize, 8);
this.blockSize = blockSize;
docPointer = new long[maxSkipLevels];
if (hasPos) {
posPointer = new long[maxSkipLevels];
posBufferUpto = new int[maxSkipLevels];
if (hasPayloads) {
payloadByteUpto = new int[maxSkipLevels];
} else {
payloadByteUpto = null;
}
if (hasOffsets) {
startOffset = new int[maxSkipLevels];
} else {
startOffset = null;
}
if (hasOffsets || hasPayloads) {
payPointer = new long[maxSkipLevels];
} else {
payPointer = null;
}
} else {
posPointer = null;
}
}
  /**
   * Trim the original docFreq to tell the skipReader to read the proper number of skip points.
   *
   * Since our definition in BlockSkip* is a little different from MultiLevelSkip*,
   * this trimmed docFreq prevents the skipReader from:
   * 1. naively reading a non-existent skip point after the last block boundary
   * 2. moving into the vInt block
   *
   */
protected int trim(int df) {
return df % blockSize == 0? df - 1: df;
}
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
super.init(skipPointer, trim(df));
lastDocPointer = docBasePointer;
lastPosPointer = posBasePointer;
lastPayPointer = payBasePointer;
Arrays.fill(docPointer, docBasePointer);
if (posPointer != null) {
Arrays.fill(posPointer, posBasePointer);
if (payPointer != null) {
Arrays.fill(payPointer, payBasePointer);
}
} else {
assert posBasePointer == 0;
}
}
/** Returns the doc pointer of the doc to which the last call of
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
public long getDocPointer() {
return lastDocPointer;
}
public long getPosPointer() {
return lastPosPointer;
}
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
public long getPayPointer() {
return lastPayPointer;
}
public int getStartOffset() {
return lastStartOffset;
}
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
public int getNextSkipDoc() {
return skipDoc[0];
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
// if (DEBUG) {
// System.out.println("seekChild level=" + level);
// }
docPointer[level] = lastDocPointer;
if (posPointer != null) {
posPointer[level] = lastPosPointer;
posBufferUpto[level] = lastPosBufferUpto;
if (startOffset != null) {
startOffset[level] = lastStartOffset;
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = lastPayloadByteUpto;
}
if (payPointer != null) {
payPointer[level] = lastPayPointer;
}
}
}
@Override
protected void setLastSkipData(int level) {
super.setLastSkipData(level);
lastDocPointer = docPointer[level];
// if (DEBUG) {
// System.out.println("setLastSkipData level=" + level);
// System.out.println(" lastDocPointer=" + lastDocPointer);
// }
if (posPointer != null) {
lastPosPointer = posPointer[level];
lastPosBufferUpto = posBufferUpto[level];
// if (DEBUG) {
// System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBUfferUpto=" + lastPosBufferUpto);
// }
if (payPointer != null) {
lastPayPointer = payPointer[level];
}
if (startOffset != null) {
lastStartOffset = startOffset[level];
}
if (payloadByteUpto != null) {
lastPayloadByteUpto = payloadByteUpto[level];
}
}
}
@Override
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
// if (DEBUG) {
// System.out.println("readSkipData level=" + level);
// }
int delta = skipStream.readVInt();
// if (DEBUG) {
// System.out.println(" delta=" + delta);
// }
docPointer[level] += skipStream.readVInt();
// if (DEBUG) {
// System.out.println(" docFP=" + docPointer[level]);
// }
if (posPointer != null) {
posPointer[level] += skipStream.readVInt();
// if (DEBUG) {
// System.out.println(" posFP=" + posPointer[level]);
// }
posBufferUpto[level] = skipStream.readVInt();
// if (DEBUG) {
// System.out.println(" posBufferUpto=" + posBufferUpto[level]);
// }
if (payloadByteUpto != null) {
payloadByteUpto[level] = skipStream.readVInt();
}
if (startOffset != null) {
startOffset[level] += skipStream.readVInt();
}
if (payPointer != null) {
payPointer[level] += skipStream.readVInt();
}
}
return delta;
}
}
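A quick worked example of the trim(int) trick above (the block size is the real BLOCK_SIZE; the doc frequencies are made up):

// trim(df): when df lands exactly on a block boundary, the base reader would
// otherwise expect one more level-0 skip entry than BlockSkipWriter wrote,
// so df is nudged down by one before being handed to MultiLevelSkipListReader.
static int trim(int df, int blockSize) {
  return df % blockSize == 0 ? df - 1 : df;
}

// With blockSize = 128:
//   trim(256, 128) == 255  -> the reader expects 1 skip entry, matching the writer
//   trim(300, 128) == 300  -> the reader expects 2 skip entries, matching the writer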

File: BlockSkipWriter.java (new file)

@@ -0,0 +1,163 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
/**
 * Writes skip lists with multiple levels, and supports skipping within a block of ints.
 *
 * Assume that docFreq = 28, skipInterval = blockSize = 12
 *
 * |       block#0       | |       block#1       | |vInts|
 * d d d d d d d d d d d d d d d d d d d d d d d d d d d d    (posting list)
 *                         ^                       ^           (level 0 skip point)
 *
 * Note that skipWriter will ignore the first document in block#0, since
 * it is useless as a skip point. Also, we'll never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
 *
 * For each skip point, we will record:
 * 1. the docID at the preceding position, i.e. for position 12, record docID[11], etc.
 * 2. its related file pointers (position, payload),
 * 3. related numbers or uptos (position, payload),
 * 4. start offset.
 *
 */
final class BlockSkipWriter extends MultiLevelSkipListWriter {
// private boolean DEBUG = BlockPostingsReader.DEBUG;
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private int[] lastStartOffset;
private int[] lastPayloadByteUpto;
private final IndexOutput docOut;
private final IndexOutput posOut;
private final IndexOutput payOut;
private int curDoc;
private long curDocPointer;
private long curPosPointer;
private long curPayPointer;
private int curPosBufferUpto;
private int curStartOffset;
private int curPayloadByteUpto;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
public BlockSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(blockSize, 8, maxSkipLevels, docCount);
this.docOut = docOut;
this.posOut = posOut;
this.payOut = payOut;
lastSkipDoc = new int[maxSkipLevels];
lastSkipDocPointer = new long[maxSkipLevels];
if (posOut != null) {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
}
lastStartOffset = new int[maxSkipLevels];
lastPayloadByteUpto = new int[maxSkipLevels];
}
}
public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
this.fieldHasOffsets = fieldHasOffsets;
this.fieldHasPayloads = fieldHasPayloads;
}
@Override
public void resetSkip() {
super.resetSkip();
Arrays.fill(lastSkipDoc, 0);
Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
if (fieldHasPositions) {
Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
if (fieldHasOffsets) {
Arrays.fill(lastStartOffset, 0);
}
if (fieldHasPayloads) {
Arrays.fill(lastPayloadByteUpto, 0);
}
if (fieldHasOffsets || fieldHasPayloads) {
Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
}
}
}
/**
* Sets the values for the current skip data.
*/
public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int startOffset, int payloadByteUpto) throws IOException {
this.curDoc = doc;
this.curDocPointer = docOut.getFilePointer();
this.curPosPointer = posFP;
this.curPayPointer = payFP;
this.curPosBufferUpto = posBufferUpto;
this.curPayloadByteUpto = payloadByteUpto;
this.curStartOffset = startOffset;
bufferSkip(numDocs);
}
@Override
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
int delta = curDoc - lastSkipDoc[level];
// if (DEBUG) {
// System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
// }
skipBuffer.writeVInt(delta);
lastSkipDoc[level] = curDoc;
skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
lastSkipDocPointer[level] = curDocPointer;
if (fieldHasPositions) {
// if (DEBUG) {
// System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
// }
skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
lastSkipPosPointer[level] = curPosPointer;
skipBuffer.writeVInt(curPosBufferUpto);
if (fieldHasPayloads) {
skipBuffer.writeVInt(curPayloadByteUpto);
}
if (fieldHasOffsets) {
skipBuffer.writeVInt(curStartOffset - lastStartOffset[level]);
lastStartOffset[level] = curStartOffset;
}
if (fieldHasOffsets || fieldHasPayloads) {
skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
lastSkipPayPointer[level] = curPayPointer;
}
}
}
}
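To make the intended call pattern concrete, here is a minimal sketch (not part of this patch) of how a postings writer could drive BlockSkipWriter for a single term. The variable names (lastDocInBlock, docCount, posFP, payFP, posBufferUpto, startOffset, payloadByteUpto, docOut) are assumptions for illustration; writeSkip(IndexOutput) is inherited from MultiLevelSkipListWriter.

    // Hypothetical call order for one term (sketch only):
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
    skipWriter.resetSkip();                          // drop skip data from the previous term
    // ... after each completed block of BLOCK_SIZE docs:
    skipWriter.bufferSkip(lastDocInBlock, docCount,  // last docID of the block, docs seen so far
                          posFP, payFP,              // current file pointers into the pos/pay files
                          posBufferUpto, startOffset, payloadByteUpto);
    // ... once the term is finished, append the multi-level skip list to the doc file:
    long skipOffset = skipWriter.writeSkip(docOut);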

View File

@ -0,0 +1,247 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts.Decoder;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
/**
* Encode all values in normal area with fixed bit width,
* which is determined by the max value in this block.
*/
final class ForUtil {
/**
* Special number of bits per value used whenever all values to encode are equal.
*/
private static final int ALL_VALUES_EQUAL = 0;
/**
   * Upper limit of the number of bytes that might be required to store
* <code>BLOCK_SIZE</code> encoded values.
*/
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
/**
* Upper limit of the number of values that might be decoded in a single call to
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
*/
static final int MAX_DATA_SIZE;
static {
int maxDataSize = 0;
for(int version=PackedInts.VERSION_START;version<=PackedInts.VERSION_CURRENT;version++) {
for (PackedInts.Format format : PackedInts.Format.values()) {
for (int bpv = 1; bpv <= 32; ++bpv) {
if (!format.isSupported(bpv)) {
continue;
}
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, version, bpv);
final int iterations = computeIterations(decoder);
maxDataSize = Math.max(maxDataSize, iterations * decoder.valueCount());
}
}
}
MAX_DATA_SIZE = maxDataSize;
}
/**
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
* values with the provided {@link Decoder}.
*/
private static int computeIterations(PackedInts.Decoder decoder) {
return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
}
/**
* Compute the number of bytes required to encode a block of values that require
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
*/
private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
}
private final int[] encodedSizes;
private final PackedInts.Encoder[] encoders;
private final PackedInts.Decoder[] decoders;
private final int[] iterations;
/**
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
*/
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
out.writeVInt(PackedInts.VERSION_CURRENT);
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
BLOCK_SIZE, bpv, acceptableOverheadRatio);
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
assert formatAndBits.bitsPerValue <= 32;
encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
}
}
/**
* Restore a {@link ForUtil} from a {@link DataInput}.
*/
ForUtil(DataInput in) throws IOException {
int packedIntsVersion = in.readVInt();
if (packedIntsVersion != PackedInts.VERSION_START) {
throw new CorruptIndexException("expected version=" + PackedInts.VERSION_START + " but got version=" + packedIntsVersion);
}
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final int code = in.readVInt();
final int formatId = code >>> 5;
final int bitsPerValue = (code & 31) + 1;
final PackedInts.Format format = PackedInts.Format.byId(formatId);
assert format.isSupported(bitsPerValue);
encodedSizes[bpv] = encodedSize(format, bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
format, packedIntsVersion, bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
format, packedIntsVersion, bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
}
}
/**
* Write a block of data (<code>For</code> format).
*
* @param data the data to write
* @param encoded a buffer to use to encode data
* @param out the destination output
* @throws IOException
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
out.writeVInt(ALL_VALUES_EQUAL);
out.writeVInt(data[0]);
return;
}
final int numBits = bitsRequired(data);
assert numBits > 0 && numBits <= 32 : numBits;
final PackedInts.Encoder encoder = encoders[numBits];
final int iters = iterations[numBits];
assert iters * encoder.valueCount() >= BLOCK_SIZE;
final int encodedSize = encodedSizes[numBits];
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
out.writeVInt(numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
}
/**
* Read the next block of data (<code>For</code> format).
*
* @param in the input to use to read data
* @param encoded a buffer that can be used to store encoded data
* @param decoded where to write decoded data
* @throws IOException
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
final int numBits = in.readVInt();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
final int value = in.readVInt();
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
return;
}
final int encodedSize = encodedSizes[numBits];
in.readBytes(encoded, 0, encodedSize);
final PackedInts.Decoder decoder = decoders[numBits];
final int iters = iterations[numBits];
assert iters * decoder.valueCount() >= BLOCK_SIZE;
decoder.decode(encoded, 0, decoded, 0, iters);
}
/**
* Skip the next block of data.
*
* @param in the input where to read data
* @throws IOException
*/
void skipBlock(IndexInput in) throws IOException {
final int numBits = in.readVInt();
if (numBits == ALL_VALUES_EQUAL) {
in.readVInt();
return;
}
assert numBits > 0 && numBits <= 32 : numBits;
final int encodedSize = encodedSizes[numBits];
in.seek(in.getFilePointer() + encodedSize);
}
private static boolean isAllEqual(final int[] data) {
final long v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
}
}
return true;
}
/**
   * Compute the number of bits required to serialize any of the values in
* <code>data</code>.
*/
private static int bitsRequired(final int[] data) {
long or = 0;
for (int i = 0; i < BLOCK_SIZE; ++i) {
assert data[i] >= 0;
or |= data[i];
}
return PackedInts.bitsRequired(or);
}
}
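As a quick round-trip illustration of the encoder above (a sketch under assumptions, not part of this patch): it uses only the constructors and methods declared in this file plus standard Lucene classes (RAMDirectory, IOContext) and the PackedInts.COMPACT overhead ratio; since ForUtil is package-private, such code would live in the same package.

    // Encode one block of 128 values and read it back (sketch only):
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("block.bin", IOContext.DEFAULT);
    ForUtil forUtil = new ForUtil(PackedInts.COMPACT, out);    // writes the per-bpv format header

    int[] values = new int[ForUtil.MAX_DATA_SIZE];
    for (int i = 0; i < BlockPostingsFormat.BLOCK_SIZE; ++i) {
      values[i] = i % 7;                                       // max value 6 -> 3 bits per value
    }
    byte[] encoded = new byte[ForUtil.MAX_ENCODED_SIZE];
    forUtil.writeBlock(values, encoded, out);                  // vInt(numBits) followed by packed bytes
    out.close();

    IndexInput in = dir.openInput("block.bin", IOContext.DEFAULT);
    ForUtil read = new ForUtil(in);                            // restores the same encoders/decoders
    int[] decoded = new int[ForUtil.MAX_DATA_SIZE];            // buffer must be >= MAX_DATA_SIZE (see above)
    read.readBlock(in, encoded, decoded);
    in.close();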

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
BlockPostingsFormat file format.
</body>
</html>

View File

@ -77,76 +77,48 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
private static class Reader extends IntIndexInput.Reader { private static class Reader extends IntIndexInput.Reader {
private final IndexInput in; private final IndexInput in;
protected final int[] pending;
int upto;
private boolean seekPending;
private long pendingFP;
private int pendingUpto;
private long lastBlockFP;
private final BlockReader blockReader; private final BlockReader blockReader;
private final int blockSize; private final int blockSize;
private final IntsRef bulkResult = new IntsRef(); private final int[] pending;
private int upto;
private boolean seekPending;
private long pendingFP;
private long lastBlockFP = -1;
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) { public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
this.in = in; this.in = in;
this.pending = pending; this.pending = pending;
this.blockSize = pending.length; this.blockSize = pending.length;
bulkResult.ints = pending;
this.blockReader = blockReader; this.blockReader = blockReader;
upto = blockSize; upto = blockSize;
} }
void seek(final long fp, final int upto) { void seek(final long fp, final int upto) {
pendingFP = fp; assert upto < blockSize;
pendingUpto = upto; if (seekPending || fp != lastBlockFP) {
seekPending = true; pendingFP = fp;
} seekPending = true;
private void maybeSeek() throws IOException {
if (seekPending) {
if (pendingFP != lastBlockFP) {
// need new block
in.seek(pendingFP);
lastBlockFP = pendingFP;
blockReader.readBlock();
}
upto = pendingUpto;
seekPending = false;
} }
this.upto = upto;
} }
@Override @Override
public int next() throws IOException { public int next() throws IOException {
this.maybeSeek(); if (seekPending) {
if (upto == blockSize) { // Seek & load new block
in.seek(pendingFP);
lastBlockFP = pendingFP;
blockReader.readBlock();
seekPending = false;
} else if (upto == blockSize) {
// Load new block
lastBlockFP = in.getFilePointer(); lastBlockFP = in.getFilePointer();
blockReader.readBlock(); blockReader.readBlock();
upto = 0; upto = 0;
} }
return pending[upto++]; return pending[upto++];
} }
@Override
public IntsRef read(final int count) throws IOException {
this.maybeSeek();
if (upto == blockSize) {
blockReader.readBlock();
upto = 0;
}
bulkResult.offset = upto;
if (upto + count < blockSize) {
bulkResult.length = count;
upto += count;
} else {
bulkResult.length = blockSize - upto;
upto = blockSize;
}
return bulkResult;
}
} }
private class Index extends IntIndexInput.Index { private class Index extends IntIndexInput.Index {
@ -178,7 +150,7 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
} }
@Override @Override
public void set(final IntIndexInput.Index other) { public void copyFrom(final IntIndexInput.Index other) {
final Index idx = (Index) other; final Index idx = (Index) other;
fp = idx.fp; fp = idx.fp;
upto = idx.upto; upto = idx.upto;

View File

@ -90,12 +90,10 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
private long lastBlockFP; private long lastBlockFP;
private int blockSize; private int blockSize;
private final BlockReader blockReader; private final BlockReader blockReader;
private final IntsRef bulkResult = new IntsRef();
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) { public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
this.in = in; this.in = in;
this.pending = pending; this.pending = pending;
bulkResult.ints = pending;
this.blockReader = blockReader; this.blockReader = blockReader;
} }
@ -146,26 +144,6 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
return pending[upto++]; return pending[upto++];
} }
@Override
public IntsRef read(final int count) throws IOException {
this.maybeSeek();
if (upto == blockSize) {
lastBlockFP = in.getFilePointer();
blockSize = blockReader.readBlock();
upto = 0;
}
bulkResult.offset = upto;
if (upto + count < blockSize) {
bulkResult.length = count;
upto += count;
} else {
bulkResult.length = blockSize - upto;
upto = blockSize;
}
return bulkResult;
}
} }
private class Index extends IntIndexInput.Index { private class Index extends IntIndexInput.Index {
@ -204,7 +182,7 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
} }
@Override @Override
public void set(final IntIndexInput.Index other) { public void copyFrom(final IntIndexInput.Index other) {
final Index idx = (Index) other; final Index idx = (Index) other;
fp = idx.fp; fp = idx.fp;
upto = idx.upto; upto = idx.upto;

View File

@ -159,7 +159,7 @@ import org.apache.lucene.util.fst.FST; // javadocs
* with the frequency of the term in that document (except when frequencies are * with the frequency of the term in that document (except when frequencies are
* omitted: {@link IndexOptions#DOCS_ONLY}).</p> * omitted: {@link IndexOptions#DOCS_ONLY}).</p>
* <ul> * <ul>
* <li>FreqFile (.frq) --&gt; Header, &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li> * <li>FreqFile (.frq) --&gt; Header, &lt;TermFreqs, SkipData?&gt; <sup>TermCount</sup></li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li> * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></li> * <li>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></li>
* <li>TermFreq --&gt; DocDelta[, Freq?]</li> * <li>TermFreq --&gt; DocDelta[, Freq?]</li>

View File

@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
int lastDocID; int lastDocID;
int df; int df;
/** Adds a new doc in this term. If this returns null
* then we just skip consuming positions/payloads. */
@Override @Override
public void startDoc(int docID, int termDocFreq) throws IOException { public void startDoc(int docID, int termDocFreq) throws IOException {
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer()); // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

View File

@ -44,7 +44,7 @@ public abstract class IntIndexInput implements Closeable {
/** Seeks primary stream to the last read offset */ /** Seeks primary stream to the last read offset */
public abstract void seek(IntIndexInput.Reader stream) throws IOException; public abstract void seek(IntIndexInput.Reader stream) throws IOException;
public abstract void set(Index other); public abstract void copyFrom(Index other);
@Override @Override
public abstract Index clone(); public abstract Index clone();
@ -55,23 +55,5 @@ public abstract class IntIndexInput implements Closeable {
/** Reads next single int */ /** Reads next single int */
public abstract int next() throws IOException; public abstract int next() throws IOException;
/** Reads next chunk of ints */
private IntsRef bulkResult;
/** Read up to count ints. */
public IntsRef read(int count) throws IOException {
if (bulkResult == null) {
bulkResult = new IntsRef();
bulkResult.ints = new int[count];
} else {
bulkResult.grow(count);
}
for(int i=0;i<count;i++) {
bulkResult.ints[i] = next();
}
bulkResult.length = count;
return bulkResult;
}
} }
} }

View File

@ -160,13 +160,13 @@ public class SepPostingsReader extends PostingsReaderBase {
if (docIndex == null) { if (docIndex == null) {
docIndex = other.docIndex.clone(); docIndex = other.docIndex.clone();
} else { } else {
docIndex.set(other.docIndex); docIndex.copyFrom(other.docIndex);
} }
if (other.freqIndex != null) { if (other.freqIndex != null) {
if (freqIndex == null) { if (freqIndex == null) {
freqIndex = other.freqIndex.clone(); freqIndex = other.freqIndex.clone();
} else { } else {
freqIndex.set(other.freqIndex); freqIndex.copyFrom(other.freqIndex);
} }
} else { } else {
freqIndex = null; freqIndex = null;
@ -175,7 +175,7 @@ public class SepPostingsReader extends PostingsReaderBase {
if (posIndex == null) { if (posIndex == null) {
posIndex = other.posIndex.clone(); posIndex = other.posIndex.clone();
} else { } else {
posIndex.set(other.posIndex); posIndex.copyFrom(other.posIndex);
} }
} else { } else {
posIndex = null; posIndex = null;
@ -352,11 +352,11 @@ public class SepPostingsReader extends PostingsReaderBase {
// TODO: can't we only do this if consumer // TODO: can't we only do this if consumer
// skipped consuming the previous docs? // skipped consuming the previous docs?
docIndex.set(termState.docIndex); docIndex.copyFrom(termState.docIndex);
docIndex.seek(docReader); docIndex.seek(docReader);
if (!omitTF) { if (!omitTF) {
freqIndex.set(termState.freqIndex); freqIndex.copyFrom(termState.freqIndex);
freqIndex.seek(freqReader); freqIndex.seek(freqReader);
} }
@ -516,15 +516,15 @@ public class SepPostingsReader extends PostingsReaderBase {
// TODO: can't we only do this if consumer // TODO: can't we only do this if consumer
// skipped consuming the previous docs? // skipped consuming the previous docs?
docIndex.set(termState.docIndex); docIndex.copyFrom(termState.docIndex);
docIndex.seek(docReader); docIndex.seek(docReader);
//System.out.println(" docIndex=" + docIndex); //System.out.println(" docIndex=" + docIndex);
freqIndex.set(termState.freqIndex); freqIndex.copyFrom(termState.freqIndex);
freqIndex.seek(freqReader); freqIndex.seek(freqReader);
//System.out.println(" freqIndex=" + freqIndex); //System.out.println(" freqIndex=" + freqIndex);
posIndex.set(termState.posIndex); posIndex.copyFrom(termState.posIndex);
//System.out.println(" posIndex=" + posIndex); //System.out.println(" posIndex=" + posIndex);
posSeekPending = true; posSeekPending = true;
payloadPending = false; payloadPending = false;
@ -629,7 +629,7 @@ public class SepPostingsReader extends PostingsReaderBase {
// NOTE: don't seek pos here; do it lazily // NOTE: don't seek pos here; do it lazily
// instead. Eg a PhraseQuery may skip to many // instead. Eg a PhraseQuery may skip to many
// docs before finally asking for positions... // docs before finally asking for positions...
posIndex.set(skipper.getPosIndex()); posIndex.copyFrom(skipper.getPosIndex());
posSeekPending = true; posSeekPending = true;
count = newCount; count = newCount;
doc = accum = skipper.getDoc(); doc = accum = skipper.getDoc();

View File

@ -108,12 +108,12 @@ class SepSkipListReader extends MultiLevelSkipListReader {
lastPayloadPointer = payloadBasePointer; lastPayloadPointer = payloadBasePointer;
for(int i=0;i<maxNumberOfSkipLevels;i++) { for(int i=0;i<maxNumberOfSkipLevels;i++) {
docIndex[i].set(docBaseIndex); docIndex[i].copyFrom(docBaseIndex);
if (freqIndex != null) { if (freqIndex != null) {
freqIndex[i].set(freqBaseIndex); freqIndex[i].copyFrom(freqBaseIndex);
} }
if (posBaseIndex != null) { if (posBaseIndex != null) {
posIndex[i].set(posBaseIndex); posIndex[i].copyFrom(posBaseIndex);
} }
} }
Arrays.fill(payloadPointer, payloadBasePointer); Arrays.fill(payloadPointer, payloadBasePointer);
@ -145,20 +145,20 @@ class SepSkipListReader extends MultiLevelSkipListReader {
lastPayloadPointer = payloadPointer[level]; lastPayloadPointer = payloadPointer[level];
lastPayloadLength = payloadLength[level]; lastPayloadLength = payloadLength[level];
if (freqIndex != null) { if (freqIndex != null) {
lastFreqIndex.set(freqIndex[level]); lastFreqIndex.copyFrom(freqIndex[level]);
} }
lastDocIndex.set(docIndex[level]); lastDocIndex.copyFrom(docIndex[level]);
if (lastPosIndex != null) { if (lastPosIndex != null) {
lastPosIndex.set(posIndex[level]); lastPosIndex.copyFrom(posIndex[level]);
} }
if (level > 0) { if (level > 0) {
if (freqIndex != null) { if (freqIndex != null) {
freqIndex[level-1].set(freqIndex[level]); freqIndex[level-1].copyFrom(freqIndex[level]);
} }
docIndex[level-1].set(docIndex[level]); docIndex[level-1].copyFrom(docIndex[level]);
if (posIndex != null) { if (posIndex != null) {
posIndex[level-1].set(posIndex[level]); posIndex[level-1].copyFrom(posIndex[level]);
} }
} }
} }

View File

@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
public class FieldInfos implements Iterable<FieldInfo> { public class FieldInfos implements Iterable<FieldInfo> {
private final boolean hasFreq; private final boolean hasFreq;
private final boolean hasProx; private final boolean hasProx;
private final boolean hasPayloads;
private final boolean hasOffsets;
private final boolean hasVectors; private final boolean hasVectors;
private final boolean hasNorms; private final boolean hasNorms;
private final boolean hasDocValues; private final boolean hasDocValues;
@ -45,6 +47,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
public FieldInfos(FieldInfo[] infos) { public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false; boolean hasVectors = false;
boolean hasProx = false; boolean hasProx = false;
boolean hasPayloads = false;
boolean hasOffsets = false;
boolean hasFreq = false; boolean hasFreq = false;
boolean hasNorms = false; boolean hasNorms = false;
boolean hasDocValues = false; boolean hasDocValues = false;
@ -58,12 +62,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
hasVectors |= info.hasVectors(); hasVectors |= info.hasVectors();
hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY; hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
hasNorms |= info.hasNorms(); hasNorms |= info.hasNorms();
hasDocValues |= info.hasDocValues(); hasDocValues |= info.hasDocValues();
hasPayloads |= info.hasPayloads();
} }
this.hasVectors = hasVectors; this.hasVectors = hasVectors;
this.hasProx = hasProx; this.hasProx = hasProx;
this.hasPayloads = hasPayloads;
this.hasOffsets = hasOffsets;
this.hasFreq = hasFreq; this.hasFreq = hasFreq;
this.hasNorms = hasNorms; this.hasNorms = hasNorms;
this.hasDocValues = hasDocValues; this.hasDocValues = hasDocValues;
@ -80,6 +88,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
return hasProx; return hasProx;
} }
/** Returns true if any fields have payloads */
public boolean hasPayloads() {
return hasPayloads;
}
/** Returns true if any fields have offsets */
public boolean hasOffsets() {
return hasOffsets;
}
/** /**
* @return true if at least one field has any vectors * @return true if at least one field has any vectors
*/ */

View File

@ -17,12 +17,12 @@ package org.apache.lucene.util.packed;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.RamUsageEstimator;
/** /**
* Space optimized random access capable array of values with a fixed number of * Space optimized random access capable array of values with a fixed number of
* bits/value. Values are packed contiguously. * bits/value. Values are packed contiguously.
@ -146,12 +146,12 @@ class Packed64 extends PackedInts.MutableImpl {
assert off + len <= arr.length; assert off + len <= arr.length;
final int originalIndex = index; final int originalIndex = index;
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue); final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
// go to the next block where the value does not span across two blocks // go to the next block where the value does not span across two blocks
final int offsetInBlocks = index % op.values(); final int offsetInBlocks = index % decoder.valueCount();
if (offsetInBlocks != 0) { if (offsetInBlocks != 0) {
for (int i = offsetInBlocks; i < op.values() && len > 0; ++i) { for (int i = offsetInBlocks; i < decoder.valueCount() && len > 0; ++i) {
arr[off++] = get(index++); arr[off++] = get(index++);
--len; --len;
} }
@ -161,12 +161,12 @@ class Packed64 extends PackedInts.MutableImpl {
} }
// bulk get // bulk get
assert index % op.values() == 0; assert index % decoder.valueCount() == 0;
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS; int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
assert (((long)index * bitsPerValue) & MOD_MASK) == 0; assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
final int iterations = len / op.values(); final int iterations = len / decoder.valueCount();
op.get(blocks, blockIndex, arr, off, iterations); decoder.decode(blocks, blockIndex, arr, off, iterations);
final int gotValues = iterations * op.values(); final int gotValues = iterations * decoder.valueCount();
index += gotValues; index += gotValues;
len -= gotValues; len -= gotValues;
assert len >= 0; assert len >= 0;
@ -210,12 +210,12 @@ class Packed64 extends PackedInts.MutableImpl {
assert off + len <= arr.length; assert off + len <= arr.length;
final int originalIndex = index; final int originalIndex = index;
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue); final PackedInts.Encoder encoder = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
// go to the next block where the value does not span across two blocks // go to the next block where the value does not span across two blocks
final int offsetInBlocks = index % op.values(); final int offsetInBlocks = index % encoder.valueCount();
if (offsetInBlocks != 0) { if (offsetInBlocks != 0) {
for (int i = offsetInBlocks; i < op.values() && len > 0; ++i) { for (int i = offsetInBlocks; i < encoder.valueCount() && len > 0; ++i) {
set(index++, arr[off++]); set(index++, arr[off++]);
--len; --len;
} }
@ -224,13 +224,13 @@ class Packed64 extends PackedInts.MutableImpl {
} }
} }
// bulk get // bulk set
assert index % op.values() == 0; assert index % encoder.valueCount() == 0;
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS; int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
assert (((long)index * bitsPerValue) & MOD_MASK) == 0; assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
final int iterations = len / op.values(); final int iterations = len / encoder.valueCount();
op.set(blocks, blockIndex, arr, off, iterations); encoder.encode(arr, off, blocks, blockIndex, iterations);
final int setValues = iterations * op.values(); final int setValues = iterations * encoder.valueCount();
index += setValues; index += setValues;
len -= setValues; len -= setValues;
assert len >= 0; assert len >= 0;

View File

@ -86,12 +86,12 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
// bulk get // bulk get
assert index % valuesPerBlock == 0; assert index % valuesPerBlock == 0;
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert op.blocks() == 1; assert decoder.blockCount() == 1;
assert op.values() == valuesPerBlock; assert decoder.valueCount() == valuesPerBlock;
final int blockIndex = index / valuesPerBlock; final int blockIndex = index / valuesPerBlock;
final int nblocks = (index + len) / valuesPerBlock - blockIndex; final int nblocks = (index + len) / valuesPerBlock - blockIndex;
op.get(blocks, blockIndex, arr, off, nblocks); decoder.decode(blocks, blockIndex, arr, off, nblocks);
final int diff = nblocks * valuesPerBlock; final int diff = nblocks * valuesPerBlock;
index += diff; len -= diff; index += diff; len -= diff;
@ -131,11 +131,11 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
// bulk set // bulk set
assert index % valuesPerBlock == 0; assert index % valuesPerBlock == 0;
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert op.blocks() == 1; assert op.blockCount() == 1;
assert op.values() == valuesPerBlock; assert op.valueCount() == valuesPerBlock;
final int blockIndex = index / valuesPerBlock; final int blockIndex = index / valuesPerBlock;
final int nblocks = (index + len) / valuesPerBlock - blockIndex; final int nblocks = (index + len) / valuesPerBlock - blockIndex;
op.set(blocks, blockIndex, arr, off, nblocks); op.encode(arr, off, blocks, blockIndex, nblocks);
final int diff = nblocks * valuesPerBlock; final int diff = nblocks * valuesPerBlock;
index += diff; len -= diff; index += diff; len -= diff;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.util.packed;
*/ */
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
@ -25,8 +26,6 @@ import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.LongsRef;
import java.io.IOException;
/** /**
* Simplistic compression for array of unsigned long values. * Simplistic compression for array of unsigned long values.
* Each value is >= 0 and <= a specified maximum value. The * Each value is >= 0 and <= a specified maximum value. The
@ -66,6 +65,14 @@ public class PackedInts {
public final static int VERSION_START = 0; public final static int VERSION_START = 0;
public final static int VERSION_CURRENT = VERSION_START; public final static int VERSION_CURRENT = VERSION_START;
private static void checkVersion(int version) {
if (version < VERSION_START) {
throw new IllegalArgumentException("Version is too old, should be at least " + VERSION_START + " (got " + version + ")");
} else if (version > VERSION_CURRENT) {
throw new IllegalArgumentException("Version is too new, should be at most " + VERSION_CURRENT + " (got " + version + ")");
}
}
/** /**
* A format to write packed ints. * A format to write packed ints.
* *
@ -241,6 +248,146 @@ public class PackedInts {
return new FormatAndBits(format, actualBitsPerValue); return new FormatAndBits(format, actualBitsPerValue);
} }
/**
* A decoder for packed integers.
*/
public static interface Decoder {
/**
* The minimum number of long blocks to decode in a single call.
*/
int blockCount();
/**
* The number of values that can be stored in <code>blockCount()</code> long
* blocks.
*/
int valueCount();
/**
* Read <code>iterations * blockCount()</code> blocks from <code>blocks</code>,
* decode them and write <code>iterations * valueCount()</code> values into
* <code>values</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start reading blocks
* @param values the values buffer
* @param valuesOffset the offset where to start writing values
* @param iterations controls how much data to decode
*/
void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations);
/**
* Read <code>8 * iterations * blockCount()</code> blocks from <code>blocks</code>,
* decode them and write <code>iterations * valueCount()</code> values into
* <code>values</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start reading blocks
* @param values the values buffer
* @param valuesOffset the offset where to start writing values
* @param iterations controls how much data to decode
*/
void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations);
/**
* Read <code>iterations * blockCount()</code> blocks from <code>blocks</code>,
* decode them and write <code>iterations * valueCount()</code> values into
* <code>values</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start reading blocks
* @param values the values buffer
* @param valuesOffset the offset where to start writing values
* @param iterations controls how much data to decode
*/
void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations);
/**
* Read <code>8 * iterations * blockCount()</code> blocks from <code>blocks</code>,
* decode them and write <code>iterations * valueCount()</code> values into
* <code>values</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start reading blocks
* @param values the values buffer
* @param valuesOffset the offset where to start writing values
* @param iterations controls how much data to decode
*/
void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations);
}
/**
* An encoder for packed integers.
*/
public static interface Encoder {
/**
* The minimum number of long blocks to encode in a single call.
*/
int blockCount();
/**
* The number of values that can be stored in <code>blockCount()</code> long
* blocks.
*/
int valueCount();
/**
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
* encode them and write <code>iterations * blockCount()</code> blocks into
* <code>blocks</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start writing blocks
* @param values the values buffer
* @param valuesOffset the offset where to start reading values
* @param iterations controls how much data to encode
*/
void encode(long[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations);
/**
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
* encode them and write <code>8 * iterations * blockCount()</code> blocks into
* <code>blocks</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start writing blocks
* @param values the values buffer
* @param valuesOffset the offset where to start reading values
* @param iterations controls how much data to encode
*/
void encode(long[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations);
/**
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
* encode them and write <code>iterations * blockCount()</code> blocks into
* <code>blocks</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start writing blocks
* @param values the values buffer
* @param valuesOffset the offset where to start reading values
* @param iterations controls how much data to encode
*/
void encode(int[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations);
/**
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
* encode them and write <code>8 * iterations * blockCount()</code> blocks into
* <code>blocks</code>.
*
* @param blocks the long blocks that hold packed integer values
* @param blocksOffset the offset where to start writing blocks
* @param values the values buffer
* @param valuesOffset the offset where to start reading values
* @param iterations controls how much data to encode
*/
void encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations);
}
/** /**
* A read-only random access array of positive integers. * A read-only random access array of positive integers.
* @lucene.internal * @lucene.internal
@ -490,8 +637,7 @@ public class PackedInts {
protected final int valueCount; protected final int valueCount;
protected final int bitsPerValue; protected final int bitsPerValue;
protected Writer(DataOutput out, int valueCount, int bitsPerValue) protected Writer(DataOutput out, int valueCount, int bitsPerValue) {
throws IOException {
assert bitsPerValue <= 64; assert bitsPerValue <= 64;
assert valueCount >= 0 || valueCount == -1; assert valueCount >= 0 || valueCount == -1;
this.out = out; this.out = out;
@ -528,6 +674,32 @@ public class PackedInts {
public abstract int ord(); public abstract int ord();
} }
/**
* Get a {@link Decoder}.
*
* @param format the format used to store packed ints
* @param version the compatibility version
* @param bitsPerValue the number of bits per value
* @return a decoder
*/
public static Decoder getDecoder(Format format, int version, int bitsPerValue) {
checkVersion(version);
return BulkOperation.of(format, bitsPerValue);
}
/**
* Get an {@link Encoder}.
*
* @param format the format used to store packed ints
* @param version the compatibility version
* @param bitsPerValue the number of bits per value
* @return an encoder
*/
public static Encoder getEncoder(Format format, int version, int bitsPerValue) {
checkVersion(version);
return BulkOperation.of(format, bitsPerValue);
}
/** /**
* Expert: Restore a {@link Reader} from a stream without reading metadata at * Expert: Restore a {@link Reader} from a stream without reading metadata at
* the beginning of the stream. This method is useful to restore data from * the beginning of the stream. This method is useful to restore data from
@ -546,6 +718,7 @@ public class PackedInts {
*/ */
public static Reader getReaderNoHeader(DataInput in, Format format, int version, public static Reader getReaderNoHeader(DataInput in, Format format, int version,
int valueCount, int bitsPerValue) throws IOException { int valueCount, int bitsPerValue) throws IOException {
checkVersion(version);
switch (format) { switch (format) {
case PACKED_SINGLE_BLOCK: case PACKED_SINGLE_BLOCK:
return Packed64SingleBlock.create(in, valueCount, bitsPerValue); return Packed64SingleBlock.create(in, valueCount, bitsPerValue);
@ -612,7 +785,8 @@ public class PackedInts {
* @lucene.internal * @lucene.internal
*/ */
public static ReaderIterator getReaderIteratorNoHeader(DataInput in, Format format, int version, public static ReaderIterator getReaderIteratorNoHeader(DataInput in, Format format, int version,
int valueCount, int bitsPerValue, int mem) throws IOException { int valueCount, int bitsPerValue, int mem) {
checkVersion(version);
return new PackedReaderIterator(format, valueCount, bitsPerValue, in, mem); return new PackedReaderIterator(format, valueCount, bitsPerValue, in, mem);
} }
@ -652,7 +826,8 @@ public class PackedInts {
* @lucene.internal * @lucene.internal
*/ */
public static Reader getDirectReaderNoHeader(IndexInput in, Format format, public static Reader getDirectReaderNoHeader(IndexInput in, Format format,
int version, int valueCount, int bitsPerValue) throws IOException { int version, int valueCount, int bitsPerValue) {
checkVersion(version);
switch (format) { switch (format) {
case PACKED: case PACKED:
return new DirectPackedReader(bitsPerValue, valueCount, in); return new DirectPackedReader(bitsPerValue, valueCount, in);
@ -784,7 +959,7 @@ public class PackedInts {
* @lucene.internal * @lucene.internal
*/ */
public static Writer getWriterNoHeader( public static Writer getWriterNoHeader(
DataOutput out, Format format, int valueCount, int bitsPerValue, int mem) throws IOException { DataOutput out, Format format, int valueCount, int bitsPerValue, int mem) {
return new PackedWriter(format, out, valueCount, bitsPerValue, mem); return new PackedWriter(format, out, valueCount, bitsPerValue, mem);
} }
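To show the new Decoder/Encoder entry points in action, here is a small sketch (not part of this patch) that round-trips a block of values through packed long[] blocks; the literal values are arbitrary and only APIs declared above are used.

    // Sketch: pack small values and decode them back.
    int bitsPerValue = PackedInts.bitsRequired(17);            // 5 bits suffice for values <= 17
    PackedInts.Encoder encoder = PackedInts.getEncoder(
        PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsPerValue);
    PackedInts.Decoder decoder = PackedInts.getDecoder(
        PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsPerValue);

    final int iterations = 1;                                  // one bulk call
    long[] values = new long[iterations * encoder.valueCount()];
    for (int i = 0; i < values.length; ++i) {
      values[i] = i % 18;                                      // every value fits in 5 bits
    }
    long[] blocks = new long[iterations * encoder.blockCount()];
    encoder.encode(values, 0, blocks, 0, iterations);          // values -> packed blocks

    long[] restored = new long[iterations * decoder.valueCount()];
    decoder.decode(blocks, 0, restored, 0, iterations);        // packed blocks -> values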

View File

@ -38,10 +38,10 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
bulkOperation = BulkOperation.of(format, bitsPerValue); bulkOperation = BulkOperation.of(format, bitsPerValue);
iterations = bulkOperation.computeIterations(valueCount, mem); iterations = bulkOperation.computeIterations(valueCount, mem);
assert iterations > 0; assert iterations > 0;
nextBlocks = new long[iterations * bulkOperation.blocks()]; nextBlocks = new long[iterations * bulkOperation.blockCount()];
nextValues = new LongsRef(new long[iterations * bulkOperation.values()], 0, 0); nextValues = new LongsRef(new long[iterations * bulkOperation.valueCount()], 0, 0);
assert iterations * bulkOperation.values() == nextValues.longs.length; assert iterations * bulkOperation.valueCount() == nextValues.longs.length;
assert iterations * bulkOperation.blocks() == nextBlocks.length; assert iterations * bulkOperation.blockCount() == nextBlocks.length;
nextValues.offset = nextValues.longs.length; nextValues.offset = nextValues.longs.length;
position = -1; position = -1;
} }
@ -70,7 +70,7 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
nextBlocks[i] = 0L; nextBlocks[i] = 0L;
} }
bulkOperation.get(nextBlocks, 0, nextValues.longs, 0, iterations); bulkOperation.decode(nextBlocks, 0, nextValues.longs, 0, iterations);
nextValues.offset = 0; nextValues.offset = 0;
} }

View File

@ -21,6 +21,7 @@ import org.apache.lucene.store.DataOutput;
import java.io.EOFException; import java.io.EOFException;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
// Packs high order byte first, to match // Packs high order byte first, to match
// IndexOutput.writeInt/Long/Short byte order // IndexOutput.writeInt/Long/Short byte order
@ -29,21 +30,20 @@ final class PackedWriter extends PackedInts.Writer {
boolean finished; boolean finished;
final PackedInts.Format format; final PackedInts.Format format;
final BulkOperation bulkOperation; final BulkOperation encoder;
final long[] nextBlocks; final long[] nextBlocks;
final long[] nextValues; final long[] nextValues;
final int iterations; final int iterations;
int off; int off;
int written; int written;
PackedWriter(PackedInts.Format format, DataOutput out, int valueCount, int bitsPerValue, int mem) PackedWriter(PackedInts.Format format, DataOutput out, int valueCount, int bitsPerValue, int mem) {
throws IOException {
super(out, valueCount, bitsPerValue); super(out, valueCount, bitsPerValue);
this.format = format; this.format = format;
bulkOperation = BulkOperation.of(format, bitsPerValue); encoder = BulkOperation.of(format, bitsPerValue);
iterations = bulkOperation.computeIterations(valueCount, mem); iterations = encoder.computeIterations(valueCount, mem);
nextBlocks = new long[iterations * bulkOperation.blocks()]; nextBlocks = new long[iterations * encoder.blockCount()];
nextValues = new long[iterations * bulkOperation.values()]; nextValues = new long[iterations * encoder.valueCount()];
off = 0; off = 0;
written = 0; written = 0;
finished = false; finished = false;
@ -63,8 +63,7 @@ final class PackedWriter extends PackedInts.Writer {
} }
nextValues[off++] = v; nextValues[off++] = v;
if (off == nextValues.length) { if (off == nextValues.length) {
flush(nextValues.length); flush();
off = 0;
} }
++written; ++written;
} }
@ -77,16 +76,17 @@ final class PackedWriter extends PackedInts.Writer {
add(0L); add(0L);
} }
} }
flush(off); flush();
finished = true; finished = true;
} }
private void flush(int nvalues) throws IOException { private void flush() throws IOException {
bulkOperation.set(nextBlocks, 0, nextValues, 0, iterations); encoder.encode(nextValues, 0, nextBlocks, 0, iterations);
final int blocks = format.nblocks(bitsPerValue, nvalues); final int blocks = format.nblocks(bitsPerValue, off);
for (int i = 0; i < blocks; ++i) { for (int i = 0; i < blocks; ++i) {
out.writeLong(nextBlocks[i]); out.writeLong(nextBlocks[i]);
} }
Arrays.fill(nextValues, 0L);
off = 0; off = 0;
} }

View File

@ -42,22 +42,52 @@ package org.apache.lucene.util.packed;
* limitations under the License. * limitations under the License.
*/ */
import java.util.EnumMap; import java.nio.LongBuffer;
import java.nio.ByteBuffer;
/** /**
* Efficient sequential read/write of packed integers. * Efficient sequential read/write of packed integers.
*/ */
abstract class BulkOperation { enum BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {
"""
static final EnumMap<PackedInts.Format, BulkOperation[]> BULK_OPERATIONS = new EnumMap<PackedInts.Format, BulkOperation[]>(PackedInts.Format.class); FOOTER="""
public static BulkOperation of(PackedInts.Format format, int bitsPerValue) { private static long[] toLongArray(int[] ints, int offset, int length) {
assert bitsPerValue > 0 && bitsPerValue <= 64; long[] arr = new long[length];
BulkOperation[] ops = BULK_OPERATIONS.get(format); for (int i = 0; i < length; ++i) {
if (ops == null || ops[bitsPerValue] == null) { arr[i] = ints[offset + i];
throw new IllegalArgumentException("format: " + format + ", bitsPerValue: " + bitsPerValue);
} }
return ops[bitsPerValue]; return arr;
}
@Override
public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}
@Override
public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}
@Override
public void encode(int[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {
encode(toLongArray(values, valuesOffset, iterations * valueCount()), 0, blocks, blocksOffset, iterations);
}
@Override
public void encode(long[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations) {
final long[] longBLocks = new long[blockCount() * iterations];
encode(values, valuesOffset, longBLocks, 0, iterations);
ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer().put(longBLocks);
}
@Override
public void encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations) {
final long[] longBLocks = new long[blockCount() * iterations];
encode(values, valuesOffset, longBLocks, 0, iterations);
ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer().put(longBLocks);
} }
/** /**
@ -67,7 +97,7 @@ abstract class BulkOperation {
* - 16 bits per value -> b=1, v=4 * - 16 bits per value -> b=1, v=4
* - 24 bits per value -> b=3, v=8 * - 24 bits per value -> b=3, v=8
* - 50 bits per value -> b=25, v=32 * - 50 bits per value -> b=25, v=32
* - 63 bits per value -> b=63, v = 64 * - 63 bits per value -> b=63, v=64
* - ... * - ...
* *
* A bulk read consists in copying <code>iterations*v</code> values that are * A bulk read consists in copying <code>iterations*v</code> values that are
@ -79,87 +109,155 @@ abstract class BulkOperation {
* <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes). * <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes).
*/ */
public final int computeIterations(int valueCount, int ramBudget) { public final int computeIterations(int valueCount, int ramBudget) {
final int iterations = (ramBudget >>> 3) / (blocks() + values()); final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount());
if (iterations == 0) { if (iterations == 0) {
// at least 1 // at least 1
return 1; return 1;
} else if ((iterations - 1) * blocks() >= valueCount) { } else if ((iterations - 1) * blockCount() >= valueCount) {
// don't allocate for more than the size of the reader // don't allocate for more than the size of the reader
return (int) Math.ceil((double) valueCount / values()); return (int) Math.ceil((double) valueCount / valueCount());
} else { } else {
return iterations; return iterations;
} }
} }
}
/**
* The minimum number of blocks required to perform a bulk get/set.
*/
public abstract int blocks();
/**
* The number of values that can be stored in <code>blocks()</code> blocks.
*/
public abstract int values();
/**
* Get <code>n * values()</code> values from <code>n * blocks()</code> blocks.
*/
public abstract void get(long[] blocks, int blockIndex, long[] values, int valuesIndex, int iterations);
/**
* Set <code>n * values()</code> values into <code>n * blocks()</code> blocks.
*/
public abstract void set(long[] blocks, int blockIndex, long[] values, int valuesIndex, int iterations);
""" """
FOOTER = "}" def casts(typ):
cast_start = "(%s) (" %typ
cast_end = ")"
if typ == "long":
cast_start = ""
cast_end = ""
return cast_start, cast_end
def masks(bits):
if bits == 64:
return "", ""
return "(", " & %sL)" %(hex((1 << bits) - 1))
def get_type(bits):
if bits == 8:
return "byte"
elif bits == 16:
return "short"
elif bits == 32:
return "int"
elif bits == 64:
return "long"
else:
assert False
def packed64singleblock(bpv, f): def packed64singleblock(bpv, f):
values = 64 / bpv values = 64 / bpv
f.write("\n static final class Packed64SingleBlockBulkOperation%d extends BulkOperation {\n\n" %bpv) f.write("\n PACKED_SINGLE_BLOCK_%d {\n\n" %bpv)
f.write(" public int blocks() {\n") f.write(" public int blockCount() {\n")
f.write(" return 1;\n") f.write(" return 1;\n")
f.write(" }\n\n") f.write(" }\n\n")
f.write(" public int values() {\n") f.write(" public int valueCount() {\n")
f.write(" return %d;\n" %values) f.write(" return %d;\n" %values)
f.write(" }\n\n") f.write(" }\n\n")
p64sb_decode(bpv, 32)
p64sb_decode(bpv, 64)
p64sb_encode(bpv, 32)
p64sb_encode(bpv, 64)
f.write(" }")
f.write(" public void get(long[] blocks, int bi, long[] values, int vi, int iterations) {\n") def p64sb_decode(bpv, bits):
f.write(" assert bi + iterations * blocks() <= blocks.length;\n") values = 64 / bpv
f.write(" assert vi + iterations * values() <= values.length;\n") typ = get_type(bits)
cast_start, cast_end = casts(typ)
f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n") f.write(" for (int i = 0; i < iterations; ++i) {\n")
f.write(" final long block = blocks[bi++];\n") f.write(" final long block = blocks[blocksOffset++];\n")
mask = (1 << bpv) - 1 mask = (1 << bpv) - 1
for i in xrange(values): for i in xrange(values):
block_offset = i / values block_offset = i / values
offset_in_block = i % values offset_in_block = i % values
if i == 0: if i == 0:
f.write(" values[vi++] = block & %dL;\n" %mask) f.write(" values[valuesOffset++] = %sblock & %dL%s;\n" %(cast_start, mask, cast_end))
elif i == values - 1: elif i == values - 1:
f.write(" values[vi++] = block >>> %d;\n" %(i * bpv)) f.write(" values[valuesOffset++] = %sblock >>> %d%s;\n" %(cast_start, i * bpv, cast_end))
else: else:
f.write(" values[vi++] = (block >>> %d) & %dL;\n" %(i * bpv, mask)) f.write(" values[valuesOffset++] = %s(block >>> %d) & %dL%s;\n" %(cast_start, i * bpv, mask, cast_end))
f.write(" }\n") f.write(" }\n")
f.write(" }\n\n") f.write(" }\n\n")
f.write(" public void set(long[] blocks, int bi, long[] values, int vi, int iterations) {\n") f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
f.write(" assert bi + iterations * blocks() <= blocks.length;\n") if bits < bpv:
f.write(" assert vi + iterations * values() <= values.length;\n") f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
f.write(" assert blocksOffset + 8 * iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
if bpv >= 32 and bits > 32:
for i in xrange(7, -1, -1):
f.write(" final long byte%d = blocks[blocksOffset++] & 0xFF;\n" %i)
else:
for i in xrange(7, -1, -1):
f.write(" final int byte%d = blocks[blocksOffset++] & 0xFF;\n" %i)
for i in xrange(values):
byte_start = (i * bpv) / 8
bit_start = (i * bpv) % 8
byte_end = ((i + 1) * bpv - 1) / 8
bit_end = ((i + 1) * bpv - 1) % 8
f.write(" values[valuesOffset++] =")
if byte_start == byte_end:
# only one byte
if bit_start == 0:
if bit_end == 7:
f.write(" byte%d" %byte_start)
else:
f.write(" byte%d & %d" %(byte_start, mask))
else:
if bit_end == 7:
f.write(" byte%d >>> %d" %(byte_start, bit_start))
else:
f.write(" (byte%d >>> %d) & %d" %(byte_start, bit_start, mask))
else:
if bit_start == 0:
f.write(" byte%d" %byte_start)
else:
f.write(" (byte%d >>> %d)" %(byte_start, bit_start))
for b in xrange(byte_start + 1, byte_end):
f.write(" | (byte%d << %d)" %(b, 8 * (b - byte_start) - bit_start))
if bit_end == 7:
f.write(" | (byte%d << %d)" %(byte_end, 8 * (byte_end - byte_start) - bit_start))
else:
f.write(" | ((byte%d & %d) << %d)" %(byte_end, 2 ** (bit_end + 1) - 1, 8 * (byte_end - byte_start) - bit_start))
f.write(";\n")
f.write(" }\n")
f.write(" }\n\n")
def p64sb_encode(bpv, bits):
values = 64 / bpv
typ = get_type(bits)
mask_start, mask_end = masks(bits)
f.write(" public void encode(%s[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n") f.write(" for (int i = 0; i < iterations; ++i) {\n")
for i in xrange(values): for i in xrange(values):
block_offset = i / values block_offset = i / values
offset_in_block = i % values offset_in_block = i % values
if i == 0: if i == 0:
f.write(" blocks[bi++] = values[vi++]") f.write(" blocks[blocksOffset++] = %svalues[valuesOffset++]%s" %(mask_start, mask_end))
else: else:
f.write(" | (values[vi++] << %d)" %(i * bpv)) f.write(" | (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, i * bpv))
if i == values - 1: if i == values - 1:
f.write(";\n") f.write(";\n")
f.write(" }\n") f.write(" }\n")
f.write(" }\n") f.write(" }\n\n")
f.write(" }\n")
def packed64(bpv, f):
  blocks = bpv
@ -169,96 +267,180 @@ def packed64(bpv, f):
    values /= 2
  assert values * bpv == 64 * blocks, "%d values, %d blocks, %d bits per value" %(values, blocks, bpv)
  mask = (1 << bpv) - 1
  f.write(" PACKED_%d {\n\n" %bpv)
  f.write(" public int blockCount() {\n")
  f.write(" return %d;\n" %blocks)
  f.write(" }\n\n")
  f.write(" public int valueCount() {\n")
  f.write(" return %d;\n" %values)
  f.write(" }\n\n")
  if bpv == 64:
    f.write(""" public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations);
}
public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer());
}
public void encode(long[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {
System.arraycopy(values, valuesOffset, blocks, blocksOffset, valueCount() * iterations);
}
}
""")
  else:
    p64_decode(bpv, 32, values)
    p64_decode(bpv, 64, values)
    p64_encode(bpv, 32, values)
    p64_encode(bpv, 64, values)
    f.write(" }\n")
f.write(" public void get(long[] blocks, int bi, long[] values, int vi, int iterations) {\n") def p64_decode(bpv, bits, values):
f.write(" assert bi + iterations * blocks() <= blocks.length;\n") typ = get_type(bits)
f.write(" assert vi + iterations * values() <= values.length;\n") cast_start, cast_end = casts(typ)
f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n") f.write(" for (int i = 0; i < iterations; ++i) {\n")
mask = (1 << bpv) - 1
for i in xrange(0, values): for i in xrange(0, values):
block_offset = i * bpv / 64 block_offset = i * bpv / 64
bit_offset = (i * bpv) % 64 bit_offset = (i * bpv) % 64
if bit_offset == 0: if bit_offset == 0:
# start of block # start of block
f.write(" final long block%d = blocks[bi++];\n" %block_offset); f.write(" final long block%d = blocks[blocksOffset++];\n" %block_offset);
f.write(" values[vi++] = block%d >>> %d;\n" %(block_offset, 64 - bpv)) f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" %(cast_start, block_offset, 64 - bpv, cast_end))
elif bit_offset + bpv == 64: elif bit_offset + bpv == 64:
# end of block # end of block
f.write(" values[vi++] = block%d & %dL;\n" %(block_offset, mask)) f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" %(cast_start, block_offset, mask, cast_end))
elif bit_offset + bpv < 64: elif bit_offset + bpv < 64:
# middle of block # middle of block
f.write(" values[vi++] = (block%d >>> %d) & %dL;\n" %(block_offset, 64 - bit_offset - bpv, mask)) f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" %(cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end))
else: else:
# value spans across 2 blocks # value spans across 2 blocks
mask1 = (1 << (64 - bit_offset)) -1 mask1 = (1 << (64 - bit_offset)) -1
shift1 = bit_offset + bpv - 64 shift1 = bit_offset + bpv - 64
shift2 = 64 - shift1 shift2 = 64 - shift1
f.write(" final long block%d = blocks[bi++];\n" %(block_offset + 1)); f.write(" final long block%d = blocks[blocksOffset++];\n" %(block_offset + 1));
f.write(" values[vi++] = ((block%d & %dL) << %d) | (block%d >>> %d);\n" %(block_offset, mask1, shift1, block_offset + 1, shift2)) f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" %(cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end))
f.write(" }\n") f.write(" }\n")
f.write(" }\n\n") f.write(" }\n\n")
f.write(" public void set(long[] blocks, int bi, long[] values, int vi, int iterations) {\n") f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
f.write(" assert bi + iterations * blocks() <= blocks.length;\n") if bits < bpv:
f.write(" assert vi + iterations * values() <= values.length;\n") f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + 8 * iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
blocks = values * bpv / 8
for i in xrange(0, values):
byte_start = i * bpv / 8
bit_start = (i * bpv) % 8
byte_end = ((i + 1) * bpv - 1) / 8
bit_end = ((i + 1) * bpv - 1) % 8
shift = lambda b: 8 * (byte_end - b - 1) + 1 + bit_end
if bit_start == 0:
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, byte_start))
for b in xrange(byte_start + 1, byte_end + 1):
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, b))
f.write(" values[valuesOffset++] =")
if byte_start == byte_end:
if bit_start == 0:
if bit_end == 7:
f.write(" byte%d" %byte_start)
else:
f.write(" byte%d >>> %d" %(byte_start, 7 - bit_end))
else:
if bit_end == 7:
f.write(" byte%d & %d" %(byte_start, 2 ** (8 - bit_start) - 1))
else:
f.write(" (byte%d >>> %d) & %d" %(byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1))
else:
if bit_start == 0:
f.write(" (byte%d << %d)" %(byte_start, shift(byte_start)))
else:
f.write(" ((byte%d & %d) << %d)" %(byte_start, 2 ** (8 - bit_start) - 1, shift(byte_start)))
for b in xrange(byte_start + 1, byte_end):
f.write(" | (byte%d << %d)" %(b, shift(b)))
if bit_end == 7:
f.write(" | byte%d" %byte_end)
else:
f.write(" | (byte%d >>> %d)" %(byte_end, 7 - bit_end))
f.write(";\n")
f.write(" }\n")
f.write(" }\n\n")
def p64_encode(bpv, bits, values):
  typ = get_type(bits)
  mask_start, mask_end = masks(bits)
  f.write(" public void encode(%s[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {\n" %typ)
  f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
  f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
  f.write(" for (int i = 0; i < iterations; ++i) {\n")
  for i in xrange(0, values):
    block_offset = i * bpv / 64
    bit_offset = (i * bpv) % 64
    if bit_offset == 0:
      # start of block
      f.write(" blocks[blocksOffset++] = (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - bpv))
    elif bit_offset + bpv == 64:
      # end of block
      f.write(" | %svalues[valuesOffset++]%s;\n" %(mask_start, mask_end))
    elif bit_offset + bpv < 64:
      # inside a block
      f.write(" | (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - bit_offset - bpv))
    else:
      # value spans across 2 blocks
      right_bits = bit_offset + bpv - 64
      f.write(" | (%svalues[valuesOffset]%s >>> %d);\n" %(mask_start, mask_end, right_bits))
      f.write(" blocks[blocksOffset++] = (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - right_bits))
  f.write(" }\n")
  f.write(" }\n\n")
if __name__ == '__main__':
  p64_bpv = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32]
  f = open(OUTPUT_FILE, 'w')
  f.write(HEADER)
  for bpv in xrange(1, 65):
    packed64(bpv, f)
    f.write(" ,\n")
  for bpv in PACKED_64_SINGLE_BLOCK_BPV:
    if bpv != PACKED_64_SINGLE_BLOCK_BPV[0]:
      f.write(" ,\n")
    packed64singleblock(bpv,f)
  f.write(" ;\n\n")
  f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n")
  f.write(" switch (format) {\n")
  f.write(" case PACKED:\n")
  f.write(" switch (bitsPerValue) {\n")
  for i in xrange(1, 65):
    f.write(" case %d:\n" %i)
    f.write(" return PACKED_%d;\n" %i)
  f.write(" default:\n")
  f.write(" throw new AssertionError();\n")
  f.write(" }\n")
  f.write(" case PACKED_SINGLE_BLOCK:\n")
  f.write(" switch (bitsPerValue) {\n")
  for i in PACKED_64_SINGLE_BLOCK_BPV:
    f.write(" case %d:\n" %i)
    f.write(" return PACKED_SINGLE_BLOCK_%d;\n" %i)
  f.write(" default:\n")
  f.write(" throw new AssertionError();\n")
  f.write(" }\n")
  f.write(" default:\n")
  f.write(" throw new AssertionError();\n")
  f.write(" }\n")
  f.write(" }\n")
  f.write(FOOTER)
  f.close()
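The generated constants are reached either through the of() switch emitted above or through PackedInts.getEncoder/getDecoder, which is how the TestPackedInts changes further down exercise them. A minimal round-trip sketch using those entry points; the class name and data are invented, and the calls follow the signatures shown in that test:

import org.apache.lucene.util.packed.PackedInts;

public class BulkCodecDemo {
  public static void main(String[] args) {
    final int bpv = 5;
    PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bpv);
    PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bpv);
    final int iterations = 2;
    long[] values = new long[iterations * encoder.valueCount()];
    for (int i = 0; i < values.length; ++i) {
      values[i] = i % 32; // anything that fits in 5 bits
    }
    long[] blocks = new long[iterations * encoder.blockCount()];
    encoder.encode(values, 0, blocks, 0, iterations);
    long[] restored = new long[values.length];
    decoder.decode(blocks, 0, restored, 0, iterations);
    System.out.println(java.util.Arrays.equals(values, restored)); // prints true
  }
}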

View File

@ -19,3 +19,4 @@ org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat
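The added line registers the new postings format with Java's service-provider mechanism so it can be resolved by name at read time. A hedged sketch of such a lookup, assuming the format registers itself under the name "Block" (the name is an assumption, not shown in this hunk):

import org.apache.lucene.codecs.PostingsFormat;

public class SpiLookupDemo {
  public static void main(String[] args) {
    // forName() resolves the format through the META-INF/services entry added above.
    PostingsFormat pf = PostingsFormat.forName("Block");
    System.out.println(pf);
  }
}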

View File

@ -0,0 +1,94 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.packed.PackedInts;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public class TestForUtil extends LuceneTestCase {
public void testEncodeDecode() throws IOException {
final int iterations = RandomInts.randomIntBetween(random(), 1, 1000);
final float acceptableOverheadRatio = random().nextFloat();
final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bpv = random().nextInt(32);
if (bpv == 0) {
final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE);
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = value;
}
} else {
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(),
0, (int) PackedInts.maxValue(bpv));
}
}
}
final Directory d = new RAMDirectory();
final long endPointer;
{
// encode
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out);
for (int i = 0; i < iterations; ++i) {
forUtil.writeBlock(
Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length),
new byte[MAX_ENCODED_SIZE], out);
}
endPointer = out.getFilePointer();
out.close();
}
{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
final ForUtil forUtil = new ForUtil(in);
for (int i = 0; i < iterations; ++i) {
if (random().nextBoolean()) {
forUtil.skipBlock(in);
continue;
}
final int[] restored = new int[MAX_DATA_SIZE];
forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored);
assertArrayEquals(Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE),
Arrays.copyOf(restored, BLOCK_SIZE));
}
assertEquals(endPointer, in.getFilePointer());
in.close();
}
}
}

View File

@ -933,6 +933,7 @@ public class TestPostingsFormat extends LuceneTestCase {
// NOTE: you can also test "weaker" index options than
// you indexed with:
testTerms(fieldsProducer, EnumSet.allOf(Option.class), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
//testTerms(fieldsProducer, EnumSet.complementOf(EnumSet.of(Option.THREADS)), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldsProducer.close();
dir.close();

View File

@ -444,7 +444,7 @@ public class TestPostingsOffsets extends LuceneTestCase {
makeToken("foo", 1, 0, 3), makeToken("foo", 1, 0, 3),
makeToken("foo", 0, 0, 3), makeToken("foo", 0, 0, 3),
makeToken("foo", 0, 0, 3) makeToken("foo", 0, 0, 3)
}); });
} }
public void testLegalbutVeryLargeOffsets() throws Exception { public void testLegalbutVeryLargeOffsets() throws Exception {

View File

@ -18,17 +18,24 @@ package org.apache.lucene.util.packed;
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts.Reader;
@Slow
@ -622,4 +629,106 @@ public class TestPackedInts extends LuceneTestCase {
}
}
public void testEncodeDecode() {
for (PackedInts.Format format : PackedInts.Format.values()) {
for (int bpv = 1; bpv <= 64; ++bpv) {
if (!format.isSupported(bpv)) {
continue;
}
String msg = format + " " + bpv;
final PackedInts.Encoder encoder = PackedInts.getEncoder(format, PackedInts.VERSION_CURRENT, bpv);
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PackedInts.VERSION_CURRENT, bpv);
final int blockCount = encoder.blockCount();
final int valueCount = encoder.valueCount();
assertEquals(blockCount, decoder.blockCount());
assertEquals(valueCount, decoder.valueCount());
final int iterations = random().nextInt(100);
final int blocksOffset = random().nextInt(100);
final int valuesOffset = random().nextInt(100);
final int blocksOffset2 = random().nextInt(100);
final int blocksLen = iterations * blockCount;
// 1. generate random inputs
final long[] blocks = new long[blocksOffset + blocksLen];
for (int i = 0; i < blocks.length; ++i) {
blocks[i] = random().nextLong();
if (format == PackedInts.Format.PACKED_SINGLE_BLOCK && 64 % bpv != 0) {
// clear highest bits for packed
final int toClear = 64 % bpv;
blocks[i] = (blocks[i] << toClear) >>> toClear;
}
}
// 2. decode
final long[] values = new long[valuesOffset + iterations * valueCount];
decoder.decode(blocks, blocksOffset, values, valuesOffset, iterations);
for (long value : values) {
assertTrue(value <= PackedInts.maxValue(bpv));
}
// test decoding to int[]
final int[] intValues;
if (bpv <= 32) {
intValues = new int[values.length];
decoder.decode(blocks, blocksOffset, intValues, valuesOffset, iterations);
assertTrue(equals(intValues, values));
} else {
intValues = null;
}
// 3. re-encode
final long[] blocks2 = new long[blocksOffset2 + blocksLen];
encoder.encode(values, valuesOffset, blocks2, blocksOffset2, iterations);
assertArrayEquals(msg, Arrays.copyOfRange(blocks, blocksOffset, blocks.length),
Arrays.copyOfRange(blocks2, blocksOffset2, blocks2.length));
// test encoding from int[]
if (bpv <= 32) {
final long[] blocks3 = new long[blocks2.length];
encoder.encode(intValues, valuesOffset, blocks3, blocksOffset2, iterations);
assertArrayEquals(msg, blocks2, blocks3);
}
// 4. byte[] decoding
final byte[] byteBlocks = new byte[8 * blocks.length];
ByteBuffer.wrap(byteBlocks).asLongBuffer().put(blocks);
final long[] values2 = new long[valuesOffset + iterations * valueCount];
decoder.decode(byteBlocks, blocksOffset * 8, values2, valuesOffset, iterations);
for (long value : values2) {
assertTrue(msg, value <= PackedInts.maxValue(bpv));
}
assertArrayEquals(msg, values, values2);
// test decoding to int[]
if (bpv <= 32) {
final int[] intValues2 = new int[values2.length];
decoder.decode(byteBlocks, blocksOffset * 8, intValues2, valuesOffset, iterations);
assertTrue(msg, equals(intValues2, values2));
}
// 5. byte[] encoding
final byte[] blocks3 = new byte[8 * (blocksOffset2 + blocksLen)];
encoder.encode(values, valuesOffset, blocks3, 8 * blocksOffset2, iterations);
assertEquals(msg, LongBuffer.wrap(blocks2), ByteBuffer.wrap(blocks3).asLongBuffer());
// test encoding from int[]
if (bpv <= 32) {
final byte[] blocks4 = new byte[blocks3.length];
encoder.encode(intValues, valuesOffset, blocks4, 8 * blocksOffset2, iterations);
assertArrayEquals(msg, blocks3, blocks4);
}
}
}
}
private static boolean equals(int[] ints, long[] longs) {
if (ints.length != longs.length) {
return false;
}
for (int i = 0; i < ints.length; ++i) {
if ((ints[i] & 0xFFFFFFFFL) != longs[i]) {
return false;
}
}
return true;
}
}

View File

@ -87,7 +87,7 @@ public class MockSingleIntIndexInput extends IntIndexInput {
}
@Override
public void copyFrom(IntIndexInput.Index other) {
fp = ((MockSingleIntIndexInputIndex) other).fp;
}

View File

@ -310,9 +310,7 @@ public abstract class LuceneTestCase extends Assert {
"MockFixedIntBlock", "MockFixedIntBlock",
"MockVariableIntBlock", "MockVariableIntBlock",
"MockSep", "MockSep",
"MockRandom", "MockRandom"
"For",
"PFor"
)); ));
// ----------------------------------------------------------------- // -----------------------------------------------------------------