mirror of https://github.com/apache/lucene.git

commit 03003209b4

LUCENE-3892: merge BlockPostingsFormat from branch

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1375486 13f79535-47bb-0310-9956-ffa450edef68
@@ -15,6 +15,11 @@ New Features
   that you must store term vector positions to store payloads.
   (Robert Muir)

* LUCENE-3892: Add a new BlockPostingsFormat that bulk-encodes docs,
  freqs and positions in large (size 128) packed-int blocks for faster
  search performance. This was from Han Jiang's 2012 Google Summer of
  Code project (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)

API Changes

* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
@@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
        // Write term stats, to separate byte[] blob:
        bytesWriter2.writeVInt(term.stats.docFreq);
        if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
          assert term.stats.totalTermFreq >= term.stats.docFreq;
          assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
          bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
        }
      }
@@ -59,30 +59,36 @@ public abstract class MultiLevelSkipListReader {
  private int skipInterval[];   // skipInterval of each level
  private int[] numSkipped;     // number of docs skipped per level

  private int[] skipDoc;        // doc id of current skip entry per level
  protected int[] skipDoc;      // doc id of current skip entry per level
  private int lastDoc;          // doc id of last read skip entry with docId <= target
  private long[] childPointer;  // child pointer of current skip entry per level
  private long lastChildPointer; // childPointer of last read skip entry with docId <= target

  private boolean inputIsBuffered;
  private final int skipMultiplier;

  public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
  protected MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, int skipMultiplier) {
    this.skipStream = new IndexInput[maxSkipLevels];
    this.skipPointer = new long[maxSkipLevels];
    this.childPointer = new long[maxSkipLevels];
    this.numSkipped = new int[maxSkipLevels];
    this.maxNumberOfSkipLevels = maxSkipLevels;
    this.skipInterval = new int[maxSkipLevels];
    this.skipMultiplier = skipMultiplier;
    this.skipStream[0] = skipStream;
    this.inputIsBuffered = (skipStream instanceof BufferedIndexInput);
    this.skipInterval[0] = skipInterval;
    for (int i = 1; i < maxSkipLevels; i++) {
      // cache skip intervals
      this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval;
      this.skipInterval[i] = this.skipInterval[i - 1] * skipMultiplier;
    }
    skipDoc = new int[maxSkipLevels];
  }

  // skipMultiplier and skipInterval are the same:
  protected MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
    this(skipStream, maxSkipLevels, skipInterval, skipInterval);
  }

  /** Returns the id of the doc to which the last call of {@link #skipTo(int)}
   * has skipped. */

@@ -187,7 +193,12 @@ public abstract class MultiLevelSkipListReader {

  /** Loads the skip levels */
  private void loadSkipLevels() throws IOException {
    numberOfSkipLevels = MathUtil.log(docCount, skipInterval[0]);
    if (docCount <= skipInterval[0]) {
      numberOfSkipLevels = 1;
    } else {
      numberOfSkipLevels = 1 + MathUtil.log(docCount / skipInterval[0], skipMultiplier);
    }

    if (numberOfSkipLevels > maxNumberOfSkipLevels) {
      numberOfSkipLevels = maxNumberOfSkipLevels;
    }
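To make the new level computation concrete, here is a small standalone sketch (not part of the patch). The helper log() stands in for MathUtil.log, which I am assuming returns floor(log_base(x)); the doc count, interval and multiplier values are illustrative only.

public class SkipLevelCountDemo {
  // stand-in for org.apache.lucene.util.MathUtil.log(x, base)
  static int log(long x, int base) {
    int ret = 0;
    while (x >= base) { x /= base; ret++; }
    return ret;
  }

  // mirrors the new logic in loadSkipLevels(): one level per skipInterval,
  // plus one more for each extra factor of skipMultiplier, clamped at maxLevels
  static int numberOfSkipLevels(int docCount, int skipInterval, int skipMultiplier, int maxLevels) {
    int levels = (docCount <= skipInterval)
        ? 1
        : 1 + log(docCount / skipInterval, skipMultiplier);
    return Math.min(levels, maxLevels);
  }

  public static void main(String[] args) {
    // e.g. 100000 docs, skipInterval = 128 (one entry per block), skipMultiplier = 8:
    // 100000/128 = 781, floor(log_8(781)) = 3, so 4 skip levels.
    System.out.println(numberOfSkipLevels(100_000, 128, 8, 10)); // prints 4
  }
}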
@@ -26,6 +26,8 @@ import org.apache.lucene.util.MathUtil;
/**
 * This abstract class writes skip lists with multiple levels.
 *
 * <pre>
 *
 * Example for skipInterval = 3:
 *                                                     c            (skip level 2)
 *                 c                 c                 c            (skip level 1)

@@ -45,6 +47,7 @@ import org.apache.lucene.util.MathUtil;
 *
 * While this class takes care of writing the different skip levels,
 * subclasses must define the actual format of the skip data.
 * </pre>
 * @lucene.experimental
 */

@@ -55,14 +58,22 @@ public abstract class MultiLevelSkipListWriter {
  // the skip interval in the list with level = 0
  private int skipInterval;

  // skipInterval used for level > 0
  private int skipMultiplier;

  // for every skip level a different buffer is used
  private RAMOutputStream[] skipBuffer;

  protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
  protected MultiLevelSkipListWriter(int skipInterval, int skipMultiplier, int maxSkipLevels, int df) {
    this.skipInterval = skipInterval;
    this.skipMultiplier = skipMultiplier;

    // calculate the maximum number of skip levels for this document frequency
    numberOfSkipLevels = MathUtil.log(df, skipInterval);
    if (df <= skipInterval) {
      numberOfSkipLevels = 1;
    } else {
      numberOfSkipLevels = 1 + MathUtil.log(df / skipInterval, skipMultiplier);
    }

    // make sure it does not exceed maxSkipLevels
    if (numberOfSkipLevels > maxSkipLevels) {

@@ -70,6 +81,11 @@ public abstract class MultiLevelSkipListWriter {
    }
  }

  // skipMultiplier and skipInterval are the same:
  protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
    this(skipInterval, skipInterval, maxSkipLevels, df);
  }

  protected void init() {
    skipBuffer = new RAMOutputStream[numberOfSkipLevels];
    for (int i = 0; i < numberOfSkipLevels; i++) {

@@ -104,11 +120,15 @@ public abstract class MultiLevelSkipListWriter {
   * @throws IOException
   */
  public void bufferSkip(int df) throws IOException {
    int numLevels;

    assert df % skipInterval == 0;
    int numLevels = 1;
    df /= skipInterval;

    // determine max level
    for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) {
    while ((df % skipMultiplier) == 0 && numLevels < numberOfSkipLevels) {
      numLevels++;
      df /= skipMultiplier;
    }

    long childPointer = 0;

@@ -150,5 +170,4 @@ public abstract class MultiLevelSkipListWriter {

    return skipPointer;
  }

}
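The rewritten bufferSkip(int df) now counts levels from the already block-aligned df. A minimal runnable sketch of that loop follows; the concrete values (skipInterval = BLOCK_SIZE = 128, skipMultiplier = 8) match what the block codec later in this patch passes in, the class and method names are hypothetical.

class BufferSkipLevelDemo {
  // mirrors the level-determination loop in the new bufferSkip(int df)
  static int levelsToBuffer(int df, int skipInterval, int skipMultiplier, int numberOfSkipLevels) {
    assert df % skipInterval == 0;
    int numLevels = 1;
    df /= skipInterval;
    while ((df % skipMultiplier) == 0 && numLevels < numberOfSkipLevels) {
      numLevels++;
      df /= skipMultiplier;
    }
    return numLevels;
  }

  public static void main(String[] args) {
    System.out.println(levelsToBuffer(128, 128, 8, 10));  // 1: first block, level-0 entry only
    System.out.println(levelsToBuffer(1024, 128, 8, 10)); // 2: 1024/128 = 8, one factor of 8
    System.out.println(levelsToBuffer(8192, 128, 8, 10)); // 3: 8192/128 = 64 = 8*8
  }
}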
@ -0,0 +1,418 @@
|
|||
package org.apache.lucene.codecs.block;
|
||||
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTreeTermsReader;
|
||||
import org.apache.lucene.codecs.BlockTreeTermsWriter;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
 * Block postings format, which encodes postings in packed int blocks
 * for faster decoding.
 *
 * <p>
 * Basic idea:
 * <ul>
 * <li>
 * <b>Packed Block and VInt Block</b>:
 * <p>In a packed block, integers are encoded with the same bit width ({@link PackedInts packed format});
 * the block size (i.e. the number of integers inside a block) is fixed. </p>
 * <p>In a VInt block, integers are encoded as {@link DataOutput#writeVInt VInt},
 * and the block size is variable.</p>
 * </li>
 *
 * <li>
 * <b>Block structure</b>:
 * <p>When a postings list is long enough, BlockPostingsFormat will try to encode most integer data
 * as packed blocks.</p>
 * <p>Take a term with 259 documents as an example: the first 256 document ids are encoded as two packed
 * blocks, while the remaining 3 are encoded as one VInt block. </p>
 * <p>Different kinds of data are always encoded separately into different packed blocks, but may
 * possibly be encoded into the same VInt block. </p>
 * <p>This strategy is applied to the pairs:
 * <document number, frequency>,
 * <position, payload length>,
 * <position, offset start, offset length>, and
 * <position, payload length, offset start, offset length>.</p>
 * </li>
 *
 * <li>
 * <b>Skipper setting</b>:
 * <p>The structure of the skip table is quite similar to Lucene40PostingsFormat. The skip interval is the
 * same as the block size, and each skip entry points to the beginning of each block. However, for
 * the first block, skip data is omitted.</p>
 * </li>
 *
 * <li>
 * <b>Positions, Payloads, and Offsets</b>:
 * <p>A position is an integer indicating where the term occurs within one document.
 * A payload is a blob of metadata associated with the current position.
 * An offset is a pair of integers indicating the tokenized start/end offsets for the given term
 * at the current position. </p>
 * <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a
 * null payload contributes one count). As mentioned in the block structure, it is possible to encode
 * these three either combined or separately.</p>
 * <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
 * position data is separated out into .pos, while payloads and offsets are encoded in .pay (payload
 * metadata will also be stored directly in .pay). When encoded as a VInt block, all three are
 * stored in .pos (as is payload metadata).</p>
 * <p>With this strategy, the majority of payload and offset data will be outside the .pos file.
 * So for queries that require only position data, running on a full index with payloads and offsets,
 * this reduces disk pre-fetches.</p>
 * </li>
 * </ul>
 * </p>
|
||||
*
|
||||
* <p>
|
||||
* Files and detailed format:
|
||||
* <ul>
|
||||
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
|
||||
* <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
|
||||
* <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
|
||||
* <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
|
||||
* <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <a name="Termdictionary" id="Termdictionary"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Dictionary</b>
|
||||
*
|
||||
* <p>The .tim file format is quite similar to Lucene40PostingsFormat,
|
||||
* with minor difference in MetadataBlock</p>
|
||||
*
|
||||
* <ul>
|
||||
* <!-- TODO: expand on this, its not really correct and doesnt explain sub-blocks etc -->
|
||||
* <li>TermDictionary(.tim) --> Header, DirOffset, PostingsHeader, PackedBlockSize,
|
||||
* <Block><sup>NumBlocks</sup>, FieldSummary</li>
|
||||
* <li>Block --> SuffixBlock, StatsBlock, MetadataBlock</li>
|
||||
* <li>SuffixBlock --> EntryCount, SuffixLength, {@link DataOutput#writeByte byte}<sup>SuffixLength</sup></li>
|
||||
* <li>StatsBlock --> StatsLength, <DocFreq, TotalTermFreq><sup>EntryCount</sup></li>
|
||||
* <li>MetadataBlock --> MetaLength, <DocFPDelta,
|
||||
* <PosFPDelta, PosVIntBlockFPDelta?, PayFPDelta?>?,
|
||||
* SkipFPDelta?><sup>EntryCount</sup></li>
|
||||
* <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength,
|
||||
* {@link DataOutput#writeByte byte}<sup>RootCodeLength</sup>, SumDocFreq, DocCount>
|
||||
* <sup>NumFields</sup></li>
|
||||
* <li>Header, PostingsHeader --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
|
||||
* <li>PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength,
|
||||
* PosVIntBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
|
||||
* {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>TotalTermFreq, DocFPDelta, PosFPDelta, PayFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq -->
|
||||
* {@link DataOutput#writeVLong VLong}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
 * <li>Only MetadataBlock is explained here; the other fields are described in
 * <a href="../lucene40/Lucene40PostingsFormat.html#Termdictionary">Lucene40PostingsFormat:TermDictionary</a>
 * </li>
 * <li>PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width is
 * determined by the largest integer. A smaller block size results in smaller variance among the widths
 * of the integers, and hence smaller indexes. A larger block size results in more efficient bulk i/o, and hence
 * better acceleration. This value should always be a multiple of 64, and is currently fixed at 128 as
 * a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.
 * <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file.
 * In particular, it is the difference in file offset between this term's
 * data and the previous term's data (or zero, for the first term in the block). On disk it is
 * stored as the difference from the previous value in the sequence. </li>
 * <li>PosFPDelta determines the position of this term's TermPositions within the .pos file,
 * while PayFPDelta determines the position of this term's <TermPayloads, TermOffsets?> within
 * the .pay file. Similar to DocFPDelta, it is the difference between two file positions (or
 * omitted, for fields that omit payloads and offsets).</li>
 * <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last pos packed
 * block within the .pos file. It is a synonym for PayVIntBlockFPDelta or OffsetVIntBlockFPDelta.
 * It is actually used to indicate whether it is necessary to load the following
 * payloads and offsets from .pos instead of .pay. Every time a new block of positions is to be
 * loaded, the PostingsReader will use this value to check whether the current block is in packed format
 * or VInt. For the packed format, payloads and offsets are fetched from .pay, otherwise from .pos.
 * (This value is omitted when the total number of positions, i.e. totalTermFreq, is less than or equal
 * to PackedBlockSize.)
 * <li>SkipFPDelta determines the position of this term's SkipData within the .doc
 * file. In particular, it is the length of the TermFreq data.
 * SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
 * (i.e. 8 in BlockPostingsFormat).</li>
|
||||
* </ul>
|
||||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Termindex" id="Termindex"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Index</b>
|
||||
* <p>The .tim file format is mentioned in
|
||||
* <a href="../lucene40/Lucene40PostingsFormat.html#Termindex">Lucene40PostingsFormat:TermIndex</a>
|
||||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <a name="Frequencies" id="Frequencies"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Frequencies and Skip Data</b>
|
||||
*
|
||||
* <p>The .doc file contains the lists of documents which contain each term, along
|
||||
* with the frequency of the term in that document (except when frequencies are
|
||||
* omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of
|
||||
* each packed or VInt block, when the length of document list is larger than packed block size.</p>
|
||||
*
|
||||
* <ul>
|
||||
* <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup></li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>TermFreqs --> <PackedBlock> <sup>PackedDocBlockNum</sup>,
|
||||
* VIntBlock? </li>
|
||||
* <li>PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
|
||||
* <li>VIntBlock --> <DocDelta[, Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
|
||||
* <li>SkipData --> <<SkipLevelLength, SkipLevel>
|
||||
* <sup>NumSkipLevels-1</sup>, SkipLevel>, SkipDatum?</li>
|
||||
* <li>SkipLevel --> <SkipDatum> <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li>
|
||||
* <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
|
||||
* OffsetStart?, PayFPSkip?>?, SkipChildLevelPointer?</li>
|
||||
* <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}</li>
|
||||
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayLength, OffsetStart, PayFPSkip
|
||||
* -->
|
||||
* {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
* <li>PackedDocDeltaBlock is theoretically generated from two steps:
|
||||
* <ol>
|
||||
* <li>Calculate the difference between each document number and previous one,
|
||||
* and get a d-gaps list (for the first document, use absolute value); </li>
|
||||
* <li>For those d-gaps from first one to PackedDocBlockNum*PackedBlockSize<sup>th</sup>,
|
||||
* separately encode as packed blocks.</li>
|
||||
* </ol>
|
||||
* If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
|
||||
* </li>
|
||||
* <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a format
|
||||
* mentioned in
|
||||
* <a href="../lucene40/Lucene40PostingsFormat.html#Frequencies">Lucene40PostingsFormat:Frequencies</a>
|
||||
* </li>
|
||||
* <li>PackedDocBlockNum is the number of packed blocks for current term's docids or frequencies.
|
||||
* In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) </li>
|
||||
* <li>TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq.
|
||||
* We use this trick since the definition of skip entry is a little different from base interface.
|
||||
* In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
|
||||
* skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
|
||||
* in BlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
|
||||
* 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
|
||||
* When DocFreq is multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
|
||||
* more skip data than BlockSkipWriter. </li>
|
||||
* <li>SkipDatum is the metadata of one skip entry.
|
||||
* For the first block (no matter packed or VInt), it is omitted.</li>
|
||||
* <li>DocSkip records the document number of every PackedBlockSize<sup>th</sup> document number in
|
||||
* the postings (i.e. last document number in each packed block). On disk it is stored as the
|
||||
* difference from previous value in the sequence. </li>
|
||||
* <li>DocFPSkip records the file offsets of each block (excluding )posting at
|
||||
* PackedBlockSize+1<sup>th</sup>, 2*PackedBlockSize+1<sup>th</sup> ... , in DocFile.
|
||||
* The file offsets are relative to the start of current term's TermFreqs.
|
||||
* On disk it is also stored as the difference from previous SkipDatum in the sequence.</li>
|
||||
* <li>Since positions and payloads are also block encoded, the skip should skip to related block first,
|
||||
* then fetch the values according to in-block offset. PosFPSkip and PayFPSkip record the file
|
||||
* offsets of related block in .pos and .pay, respectively. While PosBlockOffset indicates
|
||||
* which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always
|
||||
* equal to PosBlockOffset). Same as DocFPSkip, the file offsets are relative to the start of
|
||||
* current term's TermFreqs, and stored as a difference sequence.</li>
|
||||
* <li>PayLength indicates the length of last payload.</li>
|
||||
* <li>OffsetStart indicates the first value of last offset pair.</li>
|
||||
* </ul>
|
||||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Positions" id="Positions"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Positions</b>
|
||||
* <p>The .pos file contains the lists of positions that each term occurs at within documents. It also
|
||||
* sometimes stores part of payloads and offsets for speedup.</p>
|
||||
* <ul>
|
||||
* <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
|
||||
* VIntBlock? </li>
|
||||
* <li>VIntBlock --> PosVIntCount, <PosDelta[, PayLength?], PayData?,
|
||||
* OffsetStartDelta?, OffsetLength?><sup>PosVIntCount</sup>
|
||||
* <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
|
||||
* <li>PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength -->
|
||||
* {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>PayData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
 * <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
 * values for each term-document pair are incremental, and ordered by document number.</li>
|
||||
* <li>PackedPosBlockNum is the number of packed blocks for current term's positions, payloads or offsets.
|
||||
* In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize) </li>
|
||||
* <li>PosVIntCount is the number of positions encoded as VInt format. In particular,
|
||||
* PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize</li>
|
||||
* <li>The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock
|
||||
* in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
|
||||
* <li>PosDelta is the same as the format mentioned in
|
||||
* <a href="../lucene40/Lucene40PostingsFormat.html#Positions">Lucene40PostingsFormat:Positions</a>
|
||||
* </li>
|
||||
* <li>OffsetStartDelta is the difference between this position's startOffset from the previous
|
||||
* occurrence (or zero, if this is the first occurrence in this document).</li>
|
||||
* <li>OffsetLength indicates the length of the current offset (endOffset-startOffset).</li>
|
||||
* <li>PayloadData is the blob of metadata associated with current position.</li>
|
||||
* </ul>
|
||||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Payloads" id="Payloads"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Payloads and Offsets</b>
|
||||
* <p>The .pay file will store payloads and offsets associated with certain term-document positions.
|
||||
* Some payloads and offsets will be separated out into .pos file, for speedup reason.</p>
|
||||
* <ul>
|
||||
* <li>PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup></li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> <sup>PackedPayBlockNum</sup>
|
||||
* <li>TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> <sup>PackedPayBlockNum</sup>
|
||||
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}</li>
|
||||
* <li>SumPayLength --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup></li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
* <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of
|
||||
* payload/offsets are stored in .pos.</li>
|
||||
* <li>The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the
|
||||
* same as PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
|
||||
* While PackedStartDeltaBlock follows a same procedure as PackedDocDeltaBlock.</li>
|
||||
* <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also synonym
|
||||
* for PackedOffsetBlockNum.</li>
|
||||
* <li>SumPayLength is the total length of payloads written within one block, should be the sum
|
||||
* of PayLengths in one packed block.</li>
|
||||
* <li>PayLength in PackedPayLengthBlock is the length of each payload, associated with current
|
||||
* position.</li>
|
||||
* </ul>
|
||||
* </dd>
|
||||
* </dl>
|
||||
* </p>
|
||||
*
|
||||
*/
|
||||
|
||||
public final class BlockPostingsFormat extends PostingsFormat {
|
||||
/**
|
||||
* Filename extension for document number, frequencies, and skip data.
|
||||
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
|
||||
*/
|
||||
public static final String DOC_EXTENSION = "doc";
|
||||
|
||||
/**
|
||||
* Filename extension for positions.
|
||||
* See chapter: <a href="#Positions">Positions</a>
|
||||
*/
|
||||
public static final String POS_EXTENSION = "pos";
|
||||
|
||||
/**
|
||||
* Filename extension for payloads and offsets.
|
||||
* See chapter: <a href="#Payloads">Payloads and Offsets</a>
|
||||
*/
|
||||
public static final String PAY_EXTENSION = "pay";
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/**
|
||||
* Fixed packed block size, number of integers encoded in
|
||||
* a single packed block.
|
||||
*/
|
||||
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
|
||||
public final static int BLOCK_SIZE = 128;
|
||||
|
||||
public BlockPostingsFormat() {
|
||||
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super("Block");
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
assert minTermBlockSize > 1;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
assert minTermBlockSize <= maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new BlockTreeTermsWriter(state,
|
||||
postingsWriter,
|
||||
minTermBlockSize,
|
||||
maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir,
|
||||
state.fieldInfos,
|
||||
state.segmentInfo,
|
||||
state.context,
|
||||
state.segmentSuffix);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
|
||||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
postingsReader,
|
||||
state.context,
|
||||
state.segmentSuffix,
|
||||
state.termsIndexDivisor);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
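As a concrete illustration of the packed-block / VInt split described in the BlockPostingsFormat javadoc above, here is a standalone sketch (not part of the patch). BLOCK_SIZE = 128 matches the constant defined in BlockPostingsFormat; docFreq = 259 is the example used in the javadoc; the class name is hypothetical.

class BlockSplitDemo {
  static final int BLOCK_SIZE = 128;

  public static void main(String[] args) {
    int docFreq = 259;                        // the example term from the javadoc
    int packedBlocks = docFreq / BLOCK_SIZE;  // PackedDocBlockNum = floor(DocFreq/PackedBlockSize) = 2
    int vIntDocs = docFreq % BLOCK_SIZE;      // 3 remaining doc deltas, vInt-encoded
    System.out.println(packedBlocks + " packed blocks + " + vIntDocs + " vInt-encoded docs");
    // -> "2 packed blocks + 3 vInt-encoded docs"
  }
}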
@ -0,0 +1,563 @@
|
|||
package org.apache.lucene.codecs.block;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
||||
/**
 * Concrete class that writes the docId (and possibly frq, pos, offset and payload) lists
 * in the block postings format.
 *
 * The postings list for each term is stored separately.
 *
 * @see BlockSkipWriter for details about the skip settings and postings layout.
 *
 */
|
||||
final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
|
||||
/**
|
||||
* Expert: The maximum number of skip levels. Smaller values result in
|
||||
* slightly smaller indexes, but slower skipping in big posting lists.
|
||||
*/
|
||||
static final int maxSkipLevels = 10;
|
||||
|
||||
final static String TERMS_CODEC = "BlockPostingsWriterTerms";
|
||||
final static String DOC_CODEC = "BlockPostingsWriterDoc";
|
||||
final static String POS_CODEC = "BlockPostingsWriterPos";
|
||||
final static String PAY_CODEC = "BlockPostingsWriterPay";
|
||||
|
||||
// Increment version to change it:
|
||||
final static int VERSION_START = 0;
|
||||
final static int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
final IndexOutput docOut;
|
||||
final IndexOutput posOut;
|
||||
final IndexOutput payOut;
|
||||
|
||||
private IndexOutput termsOut;
|
||||
|
||||
// How current field indexes postings:
|
||||
private boolean fieldHasFreqs;
|
||||
private boolean fieldHasPositions;
|
||||
private boolean fieldHasOffsets;
|
||||
private boolean fieldHasPayloads;
|
||||
|
||||
// Holds starting file pointers for each term:
|
||||
private long docTermStartFP;
|
||||
private long posTermStartFP;
|
||||
private long payTermStartFP;
|
||||
|
||||
final int[] docDeltaBuffer;
|
||||
final int[] freqBuffer;
|
||||
private int docBufferUpto;
|
||||
|
||||
final int[] posDeltaBuffer;
|
||||
final int[] payloadLengthBuffer;
|
||||
final int[] offsetStartDeltaBuffer;
|
||||
final int[] offsetLengthBuffer;
|
||||
private int posBufferUpto;
|
||||
|
||||
private byte[] payloadBytes;
|
||||
private int payloadByteUpto;
|
||||
|
||||
private int lastBlockDocID;
|
||||
private long lastBlockPosFP;
|
||||
private long lastBlockPayFP;
|
||||
private int lastBlockPosBufferUpto;
|
||||
private int lastBlockStartOffset;
|
||||
private int lastBlockPayloadByteUpto;
|
||||
|
||||
private int lastDocID;
|
||||
private int lastPosition;
|
||||
private int lastStartOffset;
|
||||
private int docCount;
|
||||
|
||||
final byte[] encoded;
|
||||
|
||||
private final ForUtil forUtil;
|
||||
private final BlockSkipWriter skipWriter;
|
||||
|
||||
public BlockPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
|
||||
super();
|
||||
|
||||
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
|
||||
state.context);
|
||||
IndexOutput posOut = null;
|
||||
IndexOutput payOut = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
|
||||
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
|
||||
state.context);
|
||||
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
|
||||
|
||||
if (state.fieldInfos.hasPayloads()) {
|
||||
payloadBytes = new byte[128];
|
||||
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
payloadBytes = null;
|
||||
payloadLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasOffsets()) {
|
||||
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.PAY_EXTENSION),
|
||||
state.context);
|
||||
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
|
||||
}
|
||||
} else {
|
||||
posDeltaBuffer = null;
|
||||
payloadLengthBuffer = null;
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
payloadBytes = null;
|
||||
}
|
||||
this.payOut = payOut;
|
||||
this.posOut = posOut;
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
|
||||
}
|
||||
}
|
||||
|
||||
docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
freqBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
// TODO: should we try skipping every 2/4 blocks...?
|
||||
skipWriter = new BlockSkipWriter(maxSkipLevels,
|
||||
BLOCK_SIZE,
|
||||
state.segmentInfo.getDocCount(),
|
||||
docOut,
|
||||
posOut,
|
||||
payOut);
|
||||
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
}
|
||||
|
||||
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
|
||||
this(state, PackedInts.COMPACT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start(IndexOutput termsOut) throws IOException {
|
||||
this.termsOut = termsOut;
|
||||
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
|
||||
termsOut.writeVInt(BLOCK_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
fieldHasPayloads = fieldInfo.hasPayloads();
|
||||
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startTerm() {
|
||||
docTermStartFP = docOut.getFilePointer();
|
||||
if (fieldHasPositions) {
|
||||
posTermStartFP = posOut.getFilePointer();
|
||||
if (fieldHasPayloads || fieldHasOffsets) {
|
||||
payTermStartFP = payOut.getFilePointer();
|
||||
}
|
||||
}
|
||||
lastDocID = 0;
|
||||
lastBlockDocID = -1;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("FPW.startTerm startFP=" + docTermStartFP);
|
||||
// }
|
||||
skipWriter.resetSkip();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
// if (DEBUG) {
|
||||
// System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
|
||||
// }
|
||||
// Have collected a block of docs, and get a new doc.
|
||||
// Should write skip data as well as postings list for
|
||||
// current block.
|
||||
if (lastBlockDocID != -1 && docBufferUpto == 0) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
|
||||
// }
|
||||
skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
|
||||
}
|
||||
|
||||
final int docDelta = docID - lastDocID;
|
||||
|
||||
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
|
||||
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
|
||||
}
|
||||
|
||||
docDeltaBuffer[docBufferUpto] = docDelta;
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
|
||||
// }
|
||||
if (fieldHasFreqs) {
|
||||
freqBuffer[docBufferUpto] = termDocFreq;
|
||||
}
|
||||
docBufferUpto++;
|
||||
docCount++;
|
||||
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
|
||||
// }
|
||||
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
|
||||
if (fieldHasFreqs) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
|
||||
// }
|
||||
forUtil.writeBlock(freqBuffer, encoded, docOut);
|
||||
}
|
||||
// NOTE: don't set docBufferUpto back to 0 here;
|
||||
// finishDoc will do so (because it needs to see that
|
||||
// the block was filled so it can save skip data)
|
||||
}
|
||||
|
||||
|
||||
lastDocID = docID;
|
||||
lastPosition = 0;
|
||||
lastStartOffset = 0;
|
||||
}
|
||||
|
||||
/** Add a new position & payload */
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
// if (DEBUG) {
|
||||
// System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
|
||||
// }
|
||||
posDeltaBuffer[posBufferUpto] = position - lastPosition;
|
||||
if (fieldHasPayloads) {
|
||||
if (payload == null || payload.length == 0) {
|
||||
// no payload
|
||||
payloadLengthBuffer[posBufferUpto] = 0;
|
||||
} else {
|
||||
payloadLengthBuffer[posBufferUpto] = payload.length;
|
||||
if (payloadByteUpto + payload.length > payloadBytes.length) {
|
||||
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
|
||||
}
|
||||
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
|
||||
payloadByteUpto += payload.length;
|
||||
}
|
||||
}
|
||||
|
||||
if (fieldHasOffsets) {
|
||||
assert startOffset >= lastStartOffset;
|
||||
assert endOffset >= startOffset;
|
||||
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
|
||||
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
|
||||
lastStartOffset = startOffset;
|
||||
}
|
||||
|
||||
posBufferUpto++;
|
||||
lastPosition = position;
|
||||
if (posBufferUpto == BLOCK_SIZE) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
|
||||
// }
|
||||
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
|
||||
|
||||
if (fieldHasPayloads) {
|
||||
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
|
||||
payOut.writeVInt(payloadByteUpto);
|
||||
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
if (fieldHasOffsets) {
|
||||
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
|
||||
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
|
||||
}
|
||||
posBufferUpto = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finishDoc() throws IOException {
|
||||
    // Since we don't know the df for the current term yet, we buffer
    // skip data for each completed block and write it to the skip file
    // when the next doc arrives.
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
lastBlockDocID = lastDocID;
|
||||
if (posOut != null) {
|
||||
if (payOut != null) {
|
||||
lastBlockPayFP = payOut.getFilePointer();
|
||||
}
|
||||
lastBlockPosFP = posOut.getFilePointer();
|
||||
lastBlockPosBufferUpto = posBufferUpto;
|
||||
lastBlockStartOffset = lastStartOffset;
|
||||
lastBlockPayloadByteUpto = payloadByteUpto;
|
||||
}
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
|
||||
// }
|
||||
docBufferUpto = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static class PendingTerm {
|
||||
public final long docStartFP;
|
||||
public final long posStartFP;
|
||||
public final long payStartFP;
|
||||
public final int skipOffset;
|
||||
public final int lastPosBlockOffset;
|
||||
|
||||
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
|
||||
this.docStartFP = docStartFP;
|
||||
this.posStartFP = posStartFP;
|
||||
this.payStartFP = payStartFP;
|
||||
this.skipOffset = skipOffset;
|
||||
this.lastPosBlockOffset = lastPosBlockOffset;
|
||||
}
|
||||
}
|
||||
|
||||
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(TermStats stats) throws IOException {
|
||||
assert stats.docFreq > 0;
|
||||
|
||||
// TODO: wasteful we are counting this (counting # docs
|
||||
// for this term) in two places?
|
||||
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
|
||||
// }
|
||||
|
||||
// if (DEBUG) {
|
||||
// if (docBufferUpto > 0) {
|
||||
// System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
|
||||
// }
|
||||
// }
|
||||
|
||||
// vInt encode the remaining doc deltas and freqs:
|
||||
for(int i=0;i<docBufferUpto;i++) {
|
||||
final int docDelta = docDeltaBuffer[i];
|
||||
final int freq = freqBuffer[i];
|
||||
if (!fieldHasFreqs) {
|
||||
docOut.writeVInt(docDelta);
|
||||
} else if (freqBuffer[i] == 1) {
|
||||
docOut.writeVInt((docDelta<<1)|1);
|
||||
} else {
|
||||
docOut.writeVInt(docDelta<<1);
|
||||
docOut.writeVInt(freq);
|
||||
}
|
||||
}
|
||||
|
||||
final int lastPosBlockOffset;
|
||||
|
||||
if (fieldHasPositions) {
|
||||
// if (DEBUG) {
|
||||
// if (posBufferUpto > 0) {
|
||||
// System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
|
||||
// }
|
||||
// }
|
||||
|
||||
// totalTermFreq is just total number of positions(or payloads, or offsets)
|
||||
// associated with current term.
|
||||
assert stats.totalTermFreq != -1;
|
||||
if (stats.totalTermFreq > BLOCK_SIZE) {
|
||||
// record file offset for last pos in last block
|
||||
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
if (posBufferUpto > 0) {
|
||||
posOut.writeVInt(posBufferUpto);
|
||||
|
||||
// TODO: should we send offsets/payloads to
|
||||
// .pay...? seems wasteful (have to store extra
|
||||
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
|
||||
// majority)
|
||||
|
||||
// vInt encode the remaining positions/payloads/offsets:
|
||||
int lastPayloadLength = -1;
|
||||
int payloadBytesReadUpto = 0;
|
||||
for(int i=0;i<posBufferUpto;i++) {
|
||||
final int posDelta = posDeltaBuffer[i];
|
||||
if (fieldHasPayloads) {
|
||||
final int payloadLength = payloadLengthBuffer[i];
|
||||
if (payloadLength != lastPayloadLength) {
|
||||
lastPayloadLength = payloadLength;
|
||||
posOut.writeVInt((posDelta<<1)|1);
|
||||
posOut.writeVInt(payloadLength);
|
||||
} else {
|
||||
posOut.writeVInt(posDelta<<1);
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" i=" + i + " payloadLen=" + payloadLength);
|
||||
// }
|
||||
|
||||
if (payloadLength != 0) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
|
||||
// }
|
||||
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
|
||||
payloadBytesReadUpto += payloadLength;
|
||||
}
|
||||
} else {
|
||||
posOut.writeVInt(posDelta);
|
||||
}
|
||||
|
||||
if (fieldHasOffsets) {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
|
||||
// }
|
||||
posOut.writeVInt(offsetStartDeltaBuffer[i]);
|
||||
posOut.writeVInt(offsetLengthBuffer[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (fieldHasPayloads) {
|
||||
assert payloadBytesReadUpto == payloadByteUpto;
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
}
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
|
||||
// }
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
|
||||
int skipOffset;
|
||||
if (docCount > BLOCK_SIZE) {
|
||||
skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
|
||||
// }
|
||||
} else {
|
||||
skipOffset = -1;
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" no skip: docCount=" + docCount);
|
||||
// }
|
||||
}
|
||||
|
||||
long payStartFP;
|
||||
if (stats.totalTermFreq >= BLOCK_SIZE) {
|
||||
payStartFP = payTermStartFP;
|
||||
} else {
|
||||
payStartFP = -1;
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" payStartFP=" + payStartFP);
|
||||
// }
|
||||
|
||||
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
|
||||
docBufferUpto = 0;
|
||||
posBufferUpto = 0;
|
||||
lastDocID = 0;
|
||||
docCount = 0;
|
||||
}
|
||||
|
||||
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||
|
||||
@Override
|
||||
public void flushTermsBlock(int start, int count) throws IOException {
|
||||
|
||||
if (count == 0) {
|
||||
termsOut.writeByte((byte) 0);
|
||||
return;
|
||||
}
|
||||
|
||||
assert start <= pendingTerms.size();
|
||||
assert count <= start;
|
||||
|
||||
final int limit = pendingTerms.size() - start + count;
|
||||
|
||||
long lastDocStartFP = 0;
|
||||
long lastPosStartFP = 0;
|
||||
long lastPayStartFP = 0;
|
||||
for(int idx=limit-count; idx<limit; idx++) {
|
||||
PendingTerm term = pendingTerms.get(idx);
|
||||
|
||||
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
|
||||
lastDocStartFP = term.docStartFP;
|
||||
|
||||
if (fieldHasPositions) {
|
||||
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
|
||||
lastPosStartFP = term.posStartFP;
|
||||
if (term.lastPosBlockOffset != -1) {
|
||||
bytesWriter.writeVInt(term.lastPosBlockOffset);
|
||||
}
|
||||
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
|
||||
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
|
||||
lastPayStartFP = term.payStartFP;
|
||||
}
|
||||
}
|
||||
|
||||
if (term.skipOffset != -1) {
|
||||
bytesWriter.writeVInt(term.skipOffset);
|
||||
}
|
||||
}
|
||||
|
||||
termsOut.writeVInt((int) bytesWriter.getFilePointer());
|
||||
bytesWriter.writeTo(termsOut);
|
||||
bytesWriter.reset();
|
||||
|
||||
// Remove the terms we just wrote:
|
||||
pendingTerms.subList(limit-count, limit).clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
IOUtils.close(docOut, posOut, payOut);
|
||||
}
|
||||
}
|
|
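A minimal runnable sketch of the leftover-doc vInt encoding that finishTerm applies above: when freqs are enabled, a freq of 1 is folded into the low bit of the shifted doc delta so no second vInt is needed. The buffer contents in main() are hypothetical, and printing stands in for the IndexOutput writes; the class and method names are made up for illustration.

class VIntDocFreqDemo {
  static void writeLeftovers(int[] docDeltaBuffer, int[] freqBuffer, int count, boolean hasFreqs) {
    for (int i = 0; i < count; i++) {
      int docDelta = docDeltaBuffer[i];
      int freq = freqBuffer[i];
      if (!hasFreqs) {
        System.out.println("vInt " + docDelta);               // delta only
      } else if (freq == 1) {
        System.out.println("vInt " + ((docDelta << 1) | 1));  // low bit 1 => freq == 1, no second vInt
      } else {
        System.out.println("vInt " + (docDelta << 1));        // low bit 0 => freq follows
        System.out.println("vInt " + freq);
      }
    }
  }

  public static void main(String[] args) {
    // hypothetical leftover (< BLOCK_SIZE) doc deltas and freqs for one term
    writeLeftovers(new int[] {5, 2, 9}, new int[] {1, 3, 1}, 3, true);
  }
}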
@ -0,0 +1,244 @@
|
|||
package org.apache.lucene.codecs.block;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.codecs.MultiLevelSkipListReader;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/**
 * Implements the skip list reader for the block postings format
 * that stores positions and payloads.
 *
 * Although this skipper uses MultiLevelSkipListReader as an interface,
 * its definition of a skip position is a little different.
 *
 * For example, when skipInterval = blockSize = 3 and df = 2*skipInterval = 6,
 *
 * 0 1 2 3 4 5
 * d d d d d d    (posting list)
 *     ^     ^    (skip point in MultiLevelSkipListWriter)
 *       ^        (skip point in BlockSkipWriter)
 *
 * In this case, MultiLevelSkipListReader will use the last document as a skip point,
 * while BlockSkipReader assumes no skip point exists there.
 *
 * If we used the interface directly in BlockSkipReader, it might naively try to read
 * more skip data after the only skip point has been loaded.
 *
 * To illustrate this, suppose we call skipTo(d[5]): since skip point d[3] has a smaller docId
 * and numSkipped+blockSize == df, MultiLevelSkipListReader will assume the skip list
 * isn't exhausted yet and will try to load a non-existent skip point.
 *
 * Therefore, we trim df before passing it to the interface; see trim(int).
 *
 */
|
||||
final class BlockSkipReader extends MultiLevelSkipListReader {
|
||||
// private boolean DEBUG = BlockPostingsReader.DEBUG;
|
||||
private final int blockSize;
|
||||
|
||||
private long docPointer[];
|
||||
private long posPointer[];
|
||||
private long payPointer[];
|
||||
private int posBufferUpto[];
|
||||
private int startOffset[];
|
||||
private int payloadByteUpto[];
|
||||
|
||||
private long lastPosPointer;
|
||||
private long lastPayPointer;
|
||||
private int lastStartOffset;
|
||||
private int lastPayloadByteUpto;
|
||||
private long lastDocPointer;
|
||||
private int lastPosBufferUpto;
|
||||
|
||||
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
|
||||
super(skipStream, maxSkipLevels, blockSize, 8);
|
||||
this.blockSize = blockSize;
|
||||
docPointer = new long[maxSkipLevels];
|
||||
if (hasPos) {
|
||||
posPointer = new long[maxSkipLevels];
|
||||
posBufferUpto = new int[maxSkipLevels];
|
||||
if (hasPayloads) {
|
||||
payloadByteUpto = new int[maxSkipLevels];
|
||||
} else {
|
||||
payloadByteUpto = null;
|
||||
}
|
||||
if (hasOffsets) {
|
||||
startOffset = new int[maxSkipLevels];
|
||||
} else {
|
||||
startOffset = null;
|
||||
}
|
||||
if (hasOffsets || hasPayloads) {
|
||||
payPointer = new long[maxSkipLevels];
|
||||
} else {
|
||||
payPointer = null;
|
||||
}
|
||||
} else {
|
||||
posPointer = null;
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * Trim the original docFreq so the skip reader reads the proper number of skip points.
   *
   * Since our definition in BlockSkip* is a little different from MultiLevelSkip*,
   * this trimmed docFreq prevents the skip reader from:
   * 1. naively reading a non-existent skip point after the last block boundary
   * 2. moving into the vInt block
   *
   */
  protected int trim(int df) {
    return df % blockSize == 0 ? df - 1 : df;
  }
|
||||
|
||||
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
|
||||
super.init(skipPointer, trim(df));
|
||||
lastDocPointer = docBasePointer;
|
||||
lastPosPointer = posBasePointer;
|
||||
lastPayPointer = payBasePointer;
|
||||
|
||||
Arrays.fill(docPointer, docBasePointer);
|
||||
if (posPointer != null) {
|
||||
Arrays.fill(posPointer, posBasePointer);
|
||||
if (payPointer != null) {
|
||||
Arrays.fill(payPointer, payBasePointer);
|
||||
}
|
||||
} else {
|
||||
assert posBasePointer == 0;
|
||||
}
|
||||
}

  /** Returns the doc pointer of the doc to which the last call of
   *  {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
  public long getDocPointer() {
    return lastDocPointer;
  }

  public long getPosPointer() {
    return lastPosPointer;
  }

  public int getPosBufferUpto() {
    return lastPosBufferUpto;
  }

  public long getPayPointer() {
    return lastPayPointer;
  }

  public int getStartOffset() {
    return lastStartOffset;
  }

  public int getPayloadByteUpto() {
    return lastPayloadByteUpto;
  }

  public int getNextSkipDoc() {
    return skipDoc[0];
  }

  @Override
  protected void seekChild(int level) throws IOException {
    super.seekChild(level);
    // if (DEBUG) {
    //   System.out.println("seekChild level=" + level);
    // }
    docPointer[level] = lastDocPointer;
    if (posPointer != null) {
      posPointer[level] = lastPosPointer;
      posBufferUpto[level] = lastPosBufferUpto;
      if (startOffset != null) {
        startOffset[level] = lastStartOffset;
      }
      if (payloadByteUpto != null) {
        payloadByteUpto[level] = lastPayloadByteUpto;
      }
      if (payPointer != null) {
        payPointer[level] = lastPayPointer;
      }
    }
  }

  @Override
  protected void setLastSkipData(int level) {
    super.setLastSkipData(level);
    lastDocPointer = docPointer[level];
    // if (DEBUG) {
    //   System.out.println("setLastSkipData level=" + level);
    //   System.out.println("  lastDocPointer=" + lastDocPointer);
    // }
    if (posPointer != null) {
      lastPosPointer = posPointer[level];
      lastPosBufferUpto = posBufferUpto[level];
      // if (DEBUG) {
      //   System.out.println("  lastPosPointer=" + lastPosPointer + " lastPosBufferUpto=" + lastPosBufferUpto);
      // }
      if (payPointer != null) {
        lastPayPointer = payPointer[level];
      }
      if (startOffset != null) {
        lastStartOffset = startOffset[level];
      }
      if (payloadByteUpto != null) {
        lastPayloadByteUpto = payloadByteUpto[level];
      }
    }
  }

  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    // if (DEBUG) {
    //   System.out.println("readSkipData level=" + level);
    // }
    int delta = skipStream.readVInt();
    // if (DEBUG) {
    //   System.out.println("  delta=" + delta);
    // }
    docPointer[level] += skipStream.readVInt();
    // if (DEBUG) {
    //   System.out.println("  docFP=" + docPointer[level]);
    // }

    if (posPointer != null) {
      posPointer[level] += skipStream.readVInt();
      // if (DEBUG) {
      //   System.out.println("  posFP=" + posPointer[level]);
      // }
      posBufferUpto[level] = skipStream.readVInt();
      // if (DEBUG) {
      //   System.out.println("  posBufferUpto=" + posBufferUpto[level]);
      // }

      if (payloadByteUpto != null) {
        payloadByteUpto[level] = skipStream.readVInt();
      }

      if (startOffset != null) {
        startOffset[level] += skipStream.readVInt();
      }

      if (payPointer != null) {
        payPointer[level] += skipStream.readVInt();
      }
    }
    return delta;
  }
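
  // Summary of one skip entry as consumed above (all values are vInt-encoded; this mirrors
  // what BlockSkipWriter.writeSkipData emits, in the same order):
  //   docDelta, docFPDelta,
  //   [posFPDelta, posBufferUpto,    -- only when the field has positions
  //    [payloadByteUpto,]            -- only when the field has payloads
  //    [startOffsetDelta,]           -- only when the field has offsets
  //    [payFPDelta]]                 -- only when the field has offsets or payloads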
}

@ -0,0 +1,163 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;

/**
 * Write skip lists with multiple levels, and support skip within block ints.
 *
 * Assume that docFreq = 28, skipInterval = blockSize = 12
 *
 *  | block#0 | | block#1 | |vInts|
 *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d  (posting list)
 *                          ^                       ^        (level 0 skip point)
 *
 * Note that the skip writer ignores the first document in block#0, since
 * it is useless as a skip point.  Also, we never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
 *
 * For each skip point, we record:
 * 1. the docID just before the skip point, i.e. for position 12 we record docID[11], etc.
 * 2. the related file pointers (position, payload),
 * 3. the related in-block uptos (position, payload),
 * 4. the start offset.
 */
final class BlockSkipWriter extends MultiLevelSkipListWriter {
  // private boolean DEBUG = BlockPostingsReader.DEBUG;

  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private int[] lastStartOffset;
  private int[] lastPayloadByteUpto;

  private final IndexOutput docOut;
  private final IndexOutput posOut;
  private final IndexOutput payOut;

  private int curDoc;
  private long curDocPointer;
  private long curPosPointer;
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curStartOffset;
  private int curPayloadByteUpto;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  public BlockSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
    super(blockSize, 8, maxSkipLevels, docCount);
    this.docOut = docOut;
    this.posOut = posOut;
    this.payOut = payOut;

    lastSkipDoc = new int[maxSkipLevels];
    lastSkipDocPointer = new long[maxSkipLevels];
    if (posOut != null) {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      }
      lastStartOffset = new int[maxSkipLevels];
      lastPayloadByteUpto = new int[maxSkipLevels];
    }
  }

  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }

  @Override
  public void resetSkip() {
    super.resetSkip();
    Arrays.fill(lastSkipDoc, 0);
    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
    if (fieldHasPositions) {
      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
      if (fieldHasOffsets) {
        Arrays.fill(lastStartOffset, 0);
      }
      if (fieldHasPayloads) {
        Arrays.fill(lastPayloadByteUpto, 0);
      }
      if (fieldHasOffsets || fieldHasPayloads) {
        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
      }
    }
  }

  /**
   * Sets the values for the current skip data.
   */
  public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int startOffset, int payloadByteUpto) throws IOException {
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
    this.curPayPointer = payFP;
    this.curPosBufferUpto = posBufferUpto;
    this.curPayloadByteUpto = payloadByteUpto;
    this.curStartOffset = startOffset;
    bufferSkip(numDocs);
  }
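
  // Caller sketch (illustrative; the variable names are hypothetical): the postings writer
  // buffers one skip point per completed block of BLOCK_SIZE docs, before it starts filling
  // the next block:
  //
  //   if (docBufferUpto == BLOCK_SIZE) {
  //     skipWriter.bufferSkip(lastBlockDocID, docCount, posFP, payFP,
  //                           posBufferUpto, startOffset, payloadByteUpto);
  //   }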

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    int delta = curDoc - lastSkipDoc[level];
    // if (DEBUG) {
    //   System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
    // }
    skipBuffer.writeVInt(delta);
    lastSkipDoc[level] = curDoc;

    skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
    lastSkipDocPointer[level] = curDocPointer;

    if (fieldHasPositions) {
      // if (DEBUG) {
      //   System.out.println("  curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
      // }
      skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
      lastSkipPosPointer[level] = curPosPointer;
      skipBuffer.writeVInt(curPosBufferUpto);

      if (fieldHasPayloads) {
        skipBuffer.writeVInt(curPayloadByteUpto);
      }

      if (fieldHasOffsets) {
        skipBuffer.writeVInt(curStartOffset - lastStartOffset[level]);
        lastStartOffset[level] = curStartOffset;
      }

      if (fieldHasOffsets || fieldHasPayloads) {
        skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
        lastSkipPayPointer[level] = curPayPointer;
      }
    }
  }
}
|
|
@ -0,0 +1,247 @@
|
|||
package org.apache.lucene.codecs.block;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.packed.PackedInts.Decoder;
|
||||
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
|
||||
|
||||
/**
|
||||
 * Encodes all values of a block with the same fixed bit width,
 * which is determined by the largest value in the block.
|
||||
*/
|
||||
final class ForUtil {
|
||||
|
||||
/**
|
||||
* Special number of bits per value used whenever all values to encode are equal.
|
||||
*/
|
||||
private static final int ALL_VALUES_EQUAL = 0;
|
||||
|
||||
/**
|
||||
 * Upper limit of the number of bytes that might be required to store
|
||||
* <code>BLOCK_SIZE</code> encoded values.
|
||||
*/
|
||||
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
|
||||
|
||||
/**
|
||||
* Upper limit of the number of values that might be decoded in a single call to
|
||||
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
|
||||
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
|
||||
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
|
||||
*/
|
||||
static final int MAX_DATA_SIZE;
|
||||
static {
|
||||
int maxDataSize = 0;
|
||||
for(int version=PackedInts.VERSION_START;version<=PackedInts.VERSION_CURRENT;version++) {
|
||||
for (PackedInts.Format format : PackedInts.Format.values()) {
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
if (!format.isSupported(bpv)) {
|
||||
continue;
|
||||
}
|
||||
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, version, bpv);
|
||||
final int iterations = computeIterations(decoder);
|
||||
maxDataSize = Math.max(maxDataSize, iterations * decoder.valueCount());
|
||||
}
|
||||
}
|
||||
}
|
||||
MAX_DATA_SIZE = maxDataSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
|
||||
* values with the provided {@link Decoder}.
|
||||
*/
|
||||
private static int computeIterations(PackedInts.Decoder decoder) {
|
||||
return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
|
||||
}
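  // For example (hypothetical numbers): with BLOCK_SIZE = 128 and decoder.valueCount() == 32,
  // computeIterations returns ceil(128 / 32) = 4, i.e. four decoder calls fill one block.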
|
||||
|
||||
/**
|
||||
* Compute the number of bytes required to encode a block of values that require
|
||||
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
|
||||
*/
|
||||
private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
|
||||
return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
|
||||
}
|
||||
|
||||
private final int[] encodedSizes;
|
||||
private final PackedInts.Encoder[] encoders;
|
||||
private final PackedInts.Decoder[] decoders;
|
||||
private final int[] iterations;
|
||||
|
||||
/**
|
||||
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
|
||||
*/
|
||||
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
|
||||
out.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
encodedSizes = new int[33];
|
||||
encoders = new PackedInts.Encoder[33];
|
||||
decoders = new PackedInts.Decoder[33];
|
||||
iterations = new int[33];
|
||||
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
|
||||
BLOCK_SIZE, bpv, acceptableOverheadRatio);
|
||||
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
|
||||
assert formatAndBits.bitsPerValue <= 32;
|
||||
encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
|
||||
encoders[bpv] = PackedInts.getEncoder(
|
||||
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
|
||||
decoders[bpv] = PackedInts.getDecoder(
|
||||
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
|
||||
iterations[bpv] = computeIterations(decoders[bpv]);
|
||||
|
||||
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Restore a {@link ForUtil} from a {@link DataInput}.
|
||||
*/
|
||||
ForUtil(DataInput in) throws IOException {
|
||||
int packedIntsVersion = in.readVInt();
|
||||
if (packedIntsVersion != PackedInts.VERSION_START) {
|
||||
throw new CorruptIndexException("expected version=" + PackedInts.VERSION_START + " but got version=" + packedIntsVersion);
|
||||
}
|
||||
encodedSizes = new int[33];
|
||||
encoders = new PackedInts.Encoder[33];
|
||||
decoders = new PackedInts.Decoder[33];
|
||||
iterations = new int[33];
|
||||
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
final int code = in.readVInt();
|
||||
final int formatId = code >>> 5;
|
||||
final int bitsPerValue = (code & 31) + 1;
|
||||
|
||||
final PackedInts.Format format = PackedInts.Format.byId(formatId);
|
||||
assert format.isSupported(bitsPerValue);
|
||||
encodedSizes[bpv] = encodedSize(format, bitsPerValue);
|
||||
encoders[bpv] = PackedInts.getEncoder(
|
||||
format, packedIntsVersion, bitsPerValue);
|
||||
decoders[bpv] = PackedInts.getDecoder(
|
||||
format, packedIntsVersion, bitsPerValue);
|
||||
iterations[bpv] = computeIterations(decoders[bpv]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a block of data (<code>For</code> format).
|
||||
*
|
||||
* @param data the data to write
|
||||
* @param encoded a buffer to use to encode data
|
||||
* @param out the destination output
|
||||
* @throws IOException
|
||||
*/
|
||||
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
|
||||
if (isAllEqual(data)) {
|
||||
out.writeVInt(ALL_VALUES_EQUAL);
|
||||
out.writeVInt(data[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
final int numBits = bitsRequired(data);
|
||||
assert numBits > 0 && numBits <= 32 : numBits;
|
||||
final PackedInts.Encoder encoder = encoders[numBits];
|
||||
final int iters = iterations[numBits];
|
||||
assert iters * encoder.valueCount() >= BLOCK_SIZE;
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
|
||||
|
||||
out.writeVInt(numBits);
|
||||
|
||||
encoder.encode(data, 0, encoded, 0, iters);
|
||||
out.writeBytes(encoded, encodedSize);
|
||||
}
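  // Sketch of the resulting on-disk layout (illustrative):
  //   all values equal:  vInt(0), vInt(value)
  //   otherwise:         vInt(numBits), then encodedSizes[numBits] packed bytes,
  //                      e.g. 128 values that all fit in 5 bits -> vInt(5) + 80 bytes of packed data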
|
||||
|
||||
/**
|
||||
* Read the next block of data (<code>For</code> format).
|
||||
*
|
||||
* @param in the input to use to read data
|
||||
* @param encoded a buffer that can be used to store encoded data
|
||||
* @param decoded where to write decoded data
|
||||
* @throws IOException
|
||||
*/
|
||||
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
|
||||
final int numBits = in.readVInt();
|
||||
assert numBits <= 32 : numBits;
|
||||
|
||||
if (numBits == ALL_VALUES_EQUAL) {
|
||||
final int value = in.readVInt();
|
||||
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
|
||||
return;
|
||||
}
|
||||
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
in.readBytes(encoded, 0, encodedSize);
|
||||
|
||||
final PackedInts.Decoder decoder = decoders[numBits];
|
||||
final int iters = iterations[numBits];
|
||||
assert iters * decoder.valueCount() >= BLOCK_SIZE;
|
||||
|
||||
decoder.decode(encoded, 0, decoded, 0, iters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip the next block of data.
|
||||
*
|
||||
* @param in the input where to read data
|
||||
* @throws IOException
|
||||
*/
|
||||
void skipBlock(IndexInput in) throws IOException {
|
||||
final int numBits = in.readVInt();
|
||||
if (numBits == ALL_VALUES_EQUAL) {
|
||||
in.readVInt();
|
||||
return;
|
||||
}
|
||||
assert numBits > 0 && numBits <= 32 : numBits;
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
in.seek(in.getFilePointer() + encodedSize);
|
||||
}
|
||||
|
||||
private static boolean isAllEqual(final int[] data) {
|
||||
final long v = data[0];
|
||||
for (int i = 1; i < BLOCK_SIZE; ++i) {
|
||||
if (data[i] != v) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the number of bits required to serialize any of the longs in
|
||||
* <code>data</code>.
|
||||
*/
|
||||
private static int bitsRequired(final int[] data) {
|
||||
long or = 0;
|
||||
for (int i = 0; i < BLOCK_SIZE; ++i) {
|
||||
assert data[i] >= 0;
|
||||
or |= data[i];
|
||||
}
|
||||
return PackedInts.bitsRequired(or);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
BlockPostingsFormat file format.
|
||||
</body>
|
||||
</html>
|
|
@ -77,76 +77,48 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
|
|||
|
||||
private static class Reader extends IntIndexInput.Reader {
|
||||
private final IndexInput in;
|
||||
|
||||
protected final int[] pending;
|
||||
int upto;
|
||||
|
||||
private boolean seekPending;
|
||||
private long pendingFP;
|
||||
private int pendingUpto;
|
||||
private long lastBlockFP;
|
||||
private final BlockReader blockReader;
|
||||
private final int blockSize;
|
||||
private final IntsRef bulkResult = new IntsRef();
|
||||
private final int[] pending;
|
||||
|
||||
private int upto;
|
||||
private boolean seekPending;
|
||||
private long pendingFP;
|
||||
private long lastBlockFP = -1;
|
||||
|
||||
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
|
||||
this.in = in;
|
||||
this.pending = pending;
|
||||
this.blockSize = pending.length;
|
||||
bulkResult.ints = pending;
|
||||
this.blockReader = blockReader;
|
||||
upto = blockSize;
|
||||
}
|
||||
|
||||
void seek(final long fp, final int upto) {
|
||||
assert upto < blockSize;
|
||||
if (seekPending || fp != lastBlockFP) {
|
||||
pendingFP = fp;
|
||||
pendingUpto = upto;
|
||||
seekPending = true;
|
||||
}
|
||||
|
||||
private void maybeSeek() throws IOException {
|
||||
if (seekPending) {
|
||||
if (pendingFP != lastBlockFP) {
|
||||
// need new block
|
||||
in.seek(pendingFP);
|
||||
lastBlockFP = pendingFP;
|
||||
blockReader.readBlock();
|
||||
}
|
||||
upto = pendingUpto;
|
||||
seekPending = false;
|
||||
}
|
||||
this.upto = upto;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() throws IOException {
|
||||
this.maybeSeek();
|
||||
if (upto == blockSize) {
|
||||
if (seekPending) {
|
||||
// Seek & load new block
|
||||
in.seek(pendingFP);
|
||||
lastBlockFP = pendingFP;
|
||||
blockReader.readBlock();
|
||||
seekPending = false;
|
||||
} else if (upto == blockSize) {
|
||||
// Load new block
|
||||
lastBlockFP = in.getFilePointer();
|
||||
blockReader.readBlock();
|
||||
upto = 0;
|
||||
}
|
||||
|
||||
return pending[upto++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntsRef read(final int count) throws IOException {
|
||||
this.maybeSeek();
|
||||
if (upto == blockSize) {
|
||||
blockReader.readBlock();
|
||||
upto = 0;
|
||||
}
|
||||
bulkResult.offset = upto;
|
||||
if (upto + count < blockSize) {
|
||||
bulkResult.length = count;
|
||||
upto += count;
|
||||
} else {
|
||||
bulkResult.length = blockSize - upto;
|
||||
upto = blockSize;
|
||||
}
|
||||
|
||||
return bulkResult;
|
||||
}
|
||||
}
|
||||
|
||||
private class Index extends IntIndexInput.Index {
|
||||
|
@ -178,7 +150,7 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void set(final IntIndexInput.Index other) {
|
||||
public void copyFrom(final IntIndexInput.Index other) {
|
||||
final Index idx = (Index) other;
|
||||
fp = idx.fp;
|
||||
upto = idx.upto;
|
||||
|
|
|
@ -90,12 +90,10 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
|
|||
private long lastBlockFP;
|
||||
private int blockSize;
|
||||
private final BlockReader blockReader;
|
||||
private final IntsRef bulkResult = new IntsRef();
|
||||
|
||||
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
|
||||
this.in = in;
|
||||
this.pending = pending;
|
||||
bulkResult.ints = pending;
|
||||
this.blockReader = blockReader;
|
||||
}
|
||||
|
||||
|
@ -146,26 +144,6 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
|
|||
|
||||
return pending[upto++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntsRef read(final int count) throws IOException {
|
||||
this.maybeSeek();
|
||||
if (upto == blockSize) {
|
||||
lastBlockFP = in.getFilePointer();
|
||||
blockSize = blockReader.readBlock();
|
||||
upto = 0;
|
||||
}
|
||||
bulkResult.offset = upto;
|
||||
if (upto + count < blockSize) {
|
||||
bulkResult.length = count;
|
||||
upto += count;
|
||||
} else {
|
||||
bulkResult.length = blockSize - upto;
|
||||
upto = blockSize;
|
||||
}
|
||||
|
||||
return bulkResult;
|
||||
}
|
||||
}
|
||||
|
||||
private class Index extends IntIndexInput.Index {
|
||||
|
@ -204,7 +182,7 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void set(final IntIndexInput.Index other) {
|
||||
public void copyFrom(final IntIndexInput.Index other) {
|
||||
final Index idx = (Index) other;
|
||||
fp = idx.fp;
|
||||
upto = idx.upto;
|
||||
|
|
|
@ -159,7 +159,7 @@ import org.apache.lucene.util.fst.FST; // javadocs
|
|||
* with the frequency of the term in that document (except when frequencies are
|
||||
* omitted: {@link IndexOptions#DOCS_ONLY}).</p>
|
||||
* <ul>
|
||||
* <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData> <sup>TermCount</sup></li>
|
||||
* <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData?> <sup>TermCount</sup></li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>TermFreqs --> <TermFreq> <sup>DocFreq</sup></li>
|
||||
* <li>TermFreq --> DocDelta[, Freq?]</li>
|
||||
|
|
|
@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
int lastDocID;
|
||||
int df;
|
||||
|
||||
/** Adds a new doc in this term. If this returns null
|
||||
* then we just skip consuming positions/payloads. */
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
|
||||
|
|
|
@ -44,7 +44,7 @@ public abstract class IntIndexInput implements Closeable {
|
|||
/** Seeks primary stream to the last read offset */
|
||||
public abstract void seek(IntIndexInput.Reader stream) throws IOException;
|
||||
|
||||
public abstract void set(Index other);
|
||||
public abstract void copyFrom(Index other);
|
||||
|
||||
@Override
|
||||
public abstract Index clone();
|
||||
|
@ -55,23 +55,5 @@ public abstract class IntIndexInput implements Closeable {
|
|||
|
||||
/** Reads next single int */
|
||||
public abstract int next() throws IOException;
|
||||
|
||||
/** Reads next chunk of ints */
|
||||
private IntsRef bulkResult;
|
||||
|
||||
/** Read up to count ints. */
|
||||
public IntsRef read(int count) throws IOException {
|
||||
if (bulkResult == null) {
|
||||
bulkResult = new IntsRef();
|
||||
bulkResult.ints = new int[count];
|
||||
} else {
|
||||
bulkResult.grow(count);
|
||||
}
|
||||
for(int i=0;i<count;i++) {
|
||||
bulkResult.ints[i] = next();
|
||||
}
|
||||
bulkResult.length = count;
|
||||
return bulkResult;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -160,13 +160,13 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
if (docIndex == null) {
|
||||
docIndex = other.docIndex.clone();
|
||||
} else {
|
||||
docIndex.set(other.docIndex);
|
||||
docIndex.copyFrom(other.docIndex);
|
||||
}
|
||||
if (other.freqIndex != null) {
|
||||
if (freqIndex == null) {
|
||||
freqIndex = other.freqIndex.clone();
|
||||
} else {
|
||||
freqIndex.set(other.freqIndex);
|
||||
freqIndex.copyFrom(other.freqIndex);
|
||||
}
|
||||
} else {
|
||||
freqIndex = null;
|
||||
|
@ -175,7 +175,7 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
if (posIndex == null) {
|
||||
posIndex = other.posIndex.clone();
|
||||
} else {
|
||||
posIndex.set(other.posIndex);
|
||||
posIndex.copyFrom(other.posIndex);
|
||||
}
|
||||
} else {
|
||||
posIndex = null;
|
||||
|
@ -352,11 +352,11 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
|
||||
// TODO: can't we only do this if consumer
|
||||
// skipped consuming the previous docs?
|
||||
docIndex.set(termState.docIndex);
|
||||
docIndex.copyFrom(termState.docIndex);
|
||||
docIndex.seek(docReader);
|
||||
|
||||
if (!omitTF) {
|
||||
freqIndex.set(termState.freqIndex);
|
||||
freqIndex.copyFrom(termState.freqIndex);
|
||||
freqIndex.seek(freqReader);
|
||||
}
|
||||
|
||||
|
@ -516,15 +516,15 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
|
||||
// TODO: can't we only do this if consumer
|
||||
// skipped consuming the previous docs?
|
||||
docIndex.set(termState.docIndex);
|
||||
docIndex.copyFrom(termState.docIndex);
|
||||
docIndex.seek(docReader);
|
||||
//System.out.println(" docIndex=" + docIndex);
|
||||
|
||||
freqIndex.set(termState.freqIndex);
|
||||
freqIndex.copyFrom(termState.freqIndex);
|
||||
freqIndex.seek(freqReader);
|
||||
//System.out.println(" freqIndex=" + freqIndex);
|
||||
|
||||
posIndex.set(termState.posIndex);
|
||||
posIndex.copyFrom(termState.posIndex);
|
||||
//System.out.println(" posIndex=" + posIndex);
|
||||
posSeekPending = true;
|
||||
payloadPending = false;
|
||||
|
@ -629,7 +629,7 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
// NOTE: don't seek pos here; do it lazily
|
||||
// instead. Eg a PhraseQuery may skip to many
|
||||
// docs before finally asking for positions...
|
||||
posIndex.set(skipper.getPosIndex());
|
||||
posIndex.copyFrom(skipper.getPosIndex());
|
||||
posSeekPending = true;
|
||||
count = newCount;
|
||||
doc = accum = skipper.getDoc();
|
||||
|
|
|
@ -108,12 +108,12 @@ class SepSkipListReader extends MultiLevelSkipListReader {
|
|||
lastPayloadPointer = payloadBasePointer;
|
||||
|
||||
for(int i=0;i<maxNumberOfSkipLevels;i++) {
|
||||
docIndex[i].set(docBaseIndex);
|
||||
docIndex[i].copyFrom(docBaseIndex);
|
||||
if (freqIndex != null) {
|
||||
freqIndex[i].set(freqBaseIndex);
|
||||
freqIndex[i].copyFrom(freqBaseIndex);
|
||||
}
|
||||
if (posBaseIndex != null) {
|
||||
posIndex[i].set(posBaseIndex);
|
||||
posIndex[i].copyFrom(posBaseIndex);
|
||||
}
|
||||
}
|
||||
Arrays.fill(payloadPointer, payloadBasePointer);
|
||||
|
@ -145,20 +145,20 @@ class SepSkipListReader extends MultiLevelSkipListReader {
|
|||
lastPayloadPointer = payloadPointer[level];
|
||||
lastPayloadLength = payloadLength[level];
|
||||
if (freqIndex != null) {
|
||||
lastFreqIndex.set(freqIndex[level]);
|
||||
lastFreqIndex.copyFrom(freqIndex[level]);
|
||||
}
|
||||
lastDocIndex.set(docIndex[level]);
|
||||
lastDocIndex.copyFrom(docIndex[level]);
|
||||
if (lastPosIndex != null) {
|
||||
lastPosIndex.set(posIndex[level]);
|
||||
lastPosIndex.copyFrom(posIndex[level]);
|
||||
}
|
||||
|
||||
if (level > 0) {
|
||||
if (freqIndex != null) {
|
||||
freqIndex[level-1].set(freqIndex[level]);
|
||||
freqIndex[level-1].copyFrom(freqIndex[level]);
|
||||
}
|
||||
docIndex[level-1].set(docIndex[level]);
|
||||
docIndex[level-1].copyFrom(docIndex[level]);
|
||||
if (posIndex != null) {
|
||||
posIndex[level-1].set(posIndex[level]);
|
||||
posIndex[level-1].copyFrom(posIndex[level]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
|
|||
public class FieldInfos implements Iterable<FieldInfo> {
|
||||
private final boolean hasFreq;
|
||||
private final boolean hasProx;
|
||||
private final boolean hasPayloads;
|
||||
private final boolean hasOffsets;
|
||||
private final boolean hasVectors;
|
||||
private final boolean hasNorms;
|
||||
private final boolean hasDocValues;
|
||||
|
@ -45,6 +47,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
public FieldInfos(FieldInfo[] infos) {
|
||||
boolean hasVectors = false;
|
||||
boolean hasProx = false;
|
||||
boolean hasPayloads = false;
|
||||
boolean hasOffsets = false;
|
||||
boolean hasFreq = false;
|
||||
boolean hasNorms = false;
|
||||
boolean hasDocValues = false;
|
||||
|
@ -58,12 +62,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
hasVectors |= info.hasVectors();
|
||||
hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
|
||||
hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
hasNorms |= info.hasNorms();
|
||||
hasDocValues |= info.hasDocValues();
|
||||
hasPayloads |= info.hasPayloads();
|
||||
}
|
||||
|
||||
this.hasVectors = hasVectors;
|
||||
this.hasProx = hasProx;
|
||||
this.hasPayloads = hasPayloads;
|
||||
this.hasOffsets = hasOffsets;
|
||||
this.hasFreq = hasFreq;
|
||||
this.hasNorms = hasNorms;
|
||||
this.hasDocValues = hasDocValues;
|
||||
|
@ -80,6 +88,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
return hasProx;
|
||||
}
|
||||
|
||||
/** Returns true if any fields have payloads */
|
||||
public boolean hasPayloads() {
|
||||
return hasPayloads;
|
||||
}
|
||||
|
||||
/** Returns true if any fields have offsets */
|
||||
public boolean hasOffsets() {
|
||||
return hasOffsets;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if at least one field has any vectors
|
||||
*/
|
||||
|
|
File diff suppressed because it is too large
|
@ -17,12 +17,12 @@ package org.apache.lucene.util.packed;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* Space optimized random access capable array of values with a fixed number of
|
||||
* bits/value. Values are packed contiguously.
|
||||
|
@ -146,12 +146,12 @@ class Packed64 extends PackedInts.MutableImpl {
|
|||
assert off + len <= arr.length;
|
||||
|
||||
final int originalIndex = index;
|
||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
|
||||
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
|
||||
|
||||
// go to the next block where the value does not span across two blocks
|
||||
final int offsetInBlocks = index % op.values();
|
||||
final int offsetInBlocks = index % decoder.valueCount();
|
||||
if (offsetInBlocks != 0) {
|
||||
for (int i = offsetInBlocks; i < op.values() && len > 0; ++i) {
|
||||
for (int i = offsetInBlocks; i < decoder.valueCount() && len > 0; ++i) {
|
||||
arr[off++] = get(index++);
|
||||
--len;
|
||||
}
|
||||
|
@ -161,12 +161,12 @@ class Packed64 extends PackedInts.MutableImpl {
|
|||
}
|
||||
|
||||
// bulk get
|
||||
assert index % op.values() == 0;
|
||||
assert index % decoder.valueCount() == 0;
|
||||
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
|
||||
assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
|
||||
final int iterations = len / op.values();
|
||||
op.get(blocks, blockIndex, arr, off, iterations);
|
||||
final int gotValues = iterations * op.values();
|
||||
final int iterations = len / decoder.valueCount();
|
||||
decoder.decode(blocks, blockIndex, arr, off, iterations);
|
||||
final int gotValues = iterations * decoder.valueCount();
|
||||
index += gotValues;
|
||||
len -= gotValues;
|
||||
assert len >= 0;
|
||||
|
@ -210,12 +210,12 @@ class Packed64 extends PackedInts.MutableImpl {
|
|||
assert off + len <= arr.length;
|
||||
|
||||
final int originalIndex = index;
|
||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
|
||||
final PackedInts.Encoder encoder = BulkOperation.of(PackedInts.Format.PACKED, bitsPerValue);
|
||||
|
||||
// go to the next block where the value does not span across two blocks
|
||||
final int offsetInBlocks = index % op.values();
|
||||
final int offsetInBlocks = index % encoder.valueCount();
|
||||
if (offsetInBlocks != 0) {
|
||||
for (int i = offsetInBlocks; i < op.values() && len > 0; ++i) {
|
||||
for (int i = offsetInBlocks; i < encoder.valueCount() && len > 0; ++i) {
|
||||
set(index++, arr[off++]);
|
||||
--len;
|
||||
}
|
||||
|
@ -224,13 +224,13 @@ class Packed64 extends PackedInts.MutableImpl {
|
|||
}
|
||||
}
|
||||
|
||||
// bulk get
|
||||
assert index % op.values() == 0;
|
||||
// bulk set
|
||||
assert index % encoder.valueCount() == 0;
|
||||
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
|
||||
assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
|
||||
final int iterations = len / op.values();
|
||||
op.set(blocks, blockIndex, arr, off, iterations);
|
||||
final int setValues = iterations * op.values();
|
||||
final int iterations = len / encoder.valueCount();
|
||||
encoder.encode(arr, off, blocks, blockIndex, iterations);
|
||||
final int setValues = iterations * encoder.valueCount();
|
||||
index += setValues;
|
||||
len -= setValues;
|
||||
assert len >= 0;
|
||||
|
|
|
@ -86,12 +86,12 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
|
|||
|
||||
// bulk get
|
||||
assert index % valuesPerBlock == 0;
|
||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
|
||||
assert op.blocks() == 1;
|
||||
assert op.values() == valuesPerBlock;
|
||||
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
|
||||
assert decoder.blockCount() == 1;
|
||||
assert decoder.valueCount() == valuesPerBlock;
|
||||
final int blockIndex = index / valuesPerBlock;
|
||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex;
|
||||
op.get(blocks, blockIndex, arr, off, nblocks);
|
||||
decoder.decode(blocks, blockIndex, arr, off, nblocks);
|
||||
final int diff = nblocks * valuesPerBlock;
|
||||
index += diff; len -= diff;
|
||||
|
||||
|
@ -131,11 +131,11 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
|
|||
// bulk set
|
||||
assert index % valuesPerBlock == 0;
|
||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
|
||||
assert op.blocks() == 1;
|
||||
assert op.values() == valuesPerBlock;
|
||||
assert op.blockCount() == 1;
|
||||
assert op.valueCount() == valuesPerBlock;
|
||||
final int blockIndex = index / valuesPerBlock;
|
||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex;
|
||||
op.set(blocks, blockIndex, arr, off, nblocks);
|
||||
op.encode(arr, off, blocks, blockIndex, nblocks);
|
||||
final int diff = nblocks * valuesPerBlock;
|
||||
index += diff; len -= diff;
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.util.packed;
|
|||
*/
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -25,8 +26,6 @@ import org.apache.lucene.store.DataOutput;
|
|||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Simplistic compression for array of unsigned long values.
|
||||
* Each value is >= 0 and <= a specified maximum value. The
|
||||
|
@ -66,6 +65,14 @@ public class PackedInts {
|
|||
public final static int VERSION_START = 0;
|
||||
public final static int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
private static void checkVersion(int version) {
|
||||
if (version < VERSION_START) {
|
||||
throw new IllegalArgumentException("Version is too old, should be at least " + VERSION_START + " (got " + version + ")");
|
||||
} else if (version > VERSION_CURRENT) {
|
||||
throw new IllegalArgumentException("Version is too new, should be at most " + VERSION_CURRENT + " (got " + version + ")");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A format to write packed ints.
|
||||
*
|
||||
|
@ -241,6 +248,146 @@ public class PackedInts {
|
|||
return new FormatAndBits(format, actualBitsPerValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* A decoder for packed integers.
|
||||
*/
|
||||
public static interface Decoder {
|
||||
|
||||
/**
|
||||
* The minimum number of long blocks to decode in a single call.
|
||||
*/
|
||||
int blockCount();
|
||||
|
||||
/**
|
||||
* The number of values that can be stored in <code>blockCount()</code> long
|
||||
* blocks.
|
||||
*/
|
||||
int valueCount();
|
||||
|
||||
/**
|
||||
* Read <code>iterations * blockCount()</code> blocks from <code>blocks</code>,
|
||||
* decode them and write <code>iterations * valueCount()</code> values into
|
||||
* <code>values</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start reading blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start writing values
|
||||
* @param iterations controls how much data to decode
|
||||
*/
|
||||
void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>8 * iterations * blockCount()</code> blocks from <code>blocks</code>,
|
||||
* decode them and write <code>iterations * valueCount()</code> values into
|
||||
* <code>values</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start reading blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start writing values
|
||||
* @param iterations controls how much data to decode
|
||||
*/
|
||||
void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>iterations * blockCount()</code> blocks from <code>blocks</code>,
|
||||
* decode them and write <code>iterations * valueCount()</code> values into
|
||||
* <code>values</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start reading blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start writing values
|
||||
* @param iterations controls how much data to decode
|
||||
*/
|
||||
void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>8 * iterations * blockCount()</code> blocks from <code>blocks</code>,
|
||||
* decode them and write <code>iterations * valueCount()</code> values into
|
||||
* <code>values</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start reading blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start writing values
|
||||
* @param iterations controls how much data to decode
|
||||
*/
|
||||
void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations);
|
||||
|
||||
}
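
  // Usage sketch (illustrative): decode one 128-value block that was packed with 5 bits per value.
  //   PackedInts.Decoder decoder = PackedInts.getDecoder(Format.PACKED, PackedInts.VERSION_CURRENT, 5);
  //   int iterations = (int) Math.ceil(128.0 / decoder.valueCount());
  //   decoder.decode(blocks, 0, values, 0, iterations);   // fills at least 128 entries of 'values'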
|
||||
|
||||
/**
|
||||
* An encoder for packed integers.
|
||||
*/
|
||||
public static interface Encoder {
|
||||
|
||||
/**
|
||||
* The minimum number of long blocks to encode in a single call.
|
||||
*/
|
||||
int blockCount();
|
||||
|
||||
/**
|
||||
* The number of values that can be stored in <code>blockCount()</code> long
|
||||
* blocks.
|
||||
*/
|
||||
int valueCount();
|
||||
|
||||
/**
|
||||
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
|
||||
* encode them and write <code>iterations * blockCount()</code> blocks into
|
||||
* <code>blocks</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start writing blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start reading values
|
||||
* @param iterations controls how much data to encode
|
||||
*/
|
||||
void encode(long[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
|
||||
* encode them and write <code>8 * iterations * blockCount()</code> blocks into
|
||||
* <code>blocks</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start writing blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start reading values
|
||||
* @param iterations controls how much data to encode
|
||||
*/
|
||||
void encode(long[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
|
||||
* encode them and write <code>iterations * blockCount()</code> blocks into
|
||||
* <code>blocks</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start writing blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start reading values
|
||||
* @param iterations controls how much data to encode
|
||||
*/
|
||||
void encode(int[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations);
|
||||
|
||||
/**
|
||||
* Read <code>iterations * valueCount()</code> values from <code>values</code>,
|
||||
* encode them and write <code>8 * iterations * blockCount()</code> blocks into
|
||||
* <code>blocks</code>.
|
||||
*
|
||||
* @param blocks the long blocks that hold packed integer values
|
||||
* @param blocksOffset the offset where to start writing blocks
|
||||
* @param values the values buffer
|
||||
* @param valuesOffset the offset where to start reading values
|
||||
* @param iterations controls how much data to encode
|
||||
*/
|
||||
void encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* A read-only random access array of positive integers.
|
||||
* @lucene.internal
|
||||
|
@ -490,8 +637,7 @@ public class PackedInts {
|
|||
protected final int valueCount;
|
||||
protected final int bitsPerValue;
|
||||
|
||||
protected Writer(DataOutput out, int valueCount, int bitsPerValue)
|
||||
throws IOException {
|
||||
protected Writer(DataOutput out, int valueCount, int bitsPerValue) {
|
||||
assert bitsPerValue <= 64;
|
||||
assert valueCount >= 0 || valueCount == -1;
|
||||
this.out = out;
|
||||
|
@ -528,6 +674,32 @@ public class PackedInts {
|
|||
public abstract int ord();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a {@link Decoder}.
|
||||
*
|
||||
* @param format the format used to store packed ints
|
||||
* @param version the compatibility version
|
||||
* @param bitsPerValue the number of bits per value
|
||||
* @return a decoder
|
||||
*/
|
||||
public static Decoder getDecoder(Format format, int version, int bitsPerValue) {
|
||||
checkVersion(version);
|
||||
return BulkOperation.of(format, bitsPerValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an {@link Encoder}.
|
||||
*
|
||||
* @param format the format used to store packed ints
|
||||
* @param version the compatibility version
|
||||
* @param bitsPerValue the number of bits per value
|
||||
* @return an encoder
|
||||
*/
|
||||
public static Encoder getEncoder(Format format, int version, int bitsPerValue) {
|
||||
checkVersion(version);
|
||||
return BulkOperation.of(format, bitsPerValue);
|
||||
}
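
  // Round-trip sketch (illustrative): an encoder and decoder obtained with the same format,
  // version and bitsPerValue are symmetric, which is exactly what ForUtil relies on per block:
  //   PackedInts.Encoder enc = PackedInts.getEncoder(Format.PACKED, VERSION_CURRENT, 7);
  //   PackedInts.Decoder dec = PackedInts.getDecoder(Format.PACKED, VERSION_CURRENT, 7);
  //   enc.encode(values, 0, blocks, 0, iterations);
  //   dec.decode(blocks, 0, restored, 0, iterations);     // restored now matches values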
|
||||
|
||||
/**
|
||||
* Expert: Restore a {@link Reader} from a stream without reading metadata at
|
||||
* the beginning of the stream. This method is useful to restore data from
|
||||
|
@ -546,6 +718,7 @@ public class PackedInts {
|
|||
*/
|
||||
public static Reader getReaderNoHeader(DataInput in, Format format, int version,
|
||||
int valueCount, int bitsPerValue) throws IOException {
|
||||
checkVersion(version);
|
||||
switch (format) {
|
||||
case PACKED_SINGLE_BLOCK:
|
||||
return Packed64SingleBlock.create(in, valueCount, bitsPerValue);
|
||||
|
@ -612,7 +785,8 @@ public class PackedInts {
|
|||
* @lucene.internal
|
||||
*/
|
||||
public static ReaderIterator getReaderIteratorNoHeader(DataInput in, Format format, int version,
|
||||
int valueCount, int bitsPerValue, int mem) throws IOException {
|
||||
int valueCount, int bitsPerValue, int mem) {
|
||||
checkVersion(version);
|
||||
return new PackedReaderIterator(format, valueCount, bitsPerValue, in, mem);
|
||||
}
|
||||
|
||||
|
@ -652,7 +826,8 @@ public class PackedInts {
|
|||
* @lucene.internal
|
||||
*/
|
||||
public static Reader getDirectReaderNoHeader(IndexInput in, Format format,
|
||||
int version, int valueCount, int bitsPerValue) throws IOException {
|
||||
int version, int valueCount, int bitsPerValue) {
|
||||
checkVersion(version);
|
||||
switch (format) {
|
||||
case PACKED:
|
||||
return new DirectPackedReader(bitsPerValue, valueCount, in);
|
||||
|
@ -784,7 +959,7 @@ public class PackedInts {
|
|||
* @lucene.internal
|
||||
*/
|
||||
public static Writer getWriterNoHeader(
|
||||
DataOutput out, Format format, int valueCount, int bitsPerValue, int mem) throws IOException {
|
||||
DataOutput out, Format format, int valueCount, int bitsPerValue, int mem) {
|
||||
return new PackedWriter(format, out, valueCount, bitsPerValue, mem);
|
||||
}
|
||||
|
||||
|
|
|
@ -38,10 +38,10 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
|
|||
bulkOperation = BulkOperation.of(format, bitsPerValue);
|
||||
iterations = bulkOperation.computeIterations(valueCount, mem);
|
||||
assert iterations > 0;
|
||||
nextBlocks = new long[iterations * bulkOperation.blocks()];
|
||||
nextValues = new LongsRef(new long[iterations * bulkOperation.values()], 0, 0);
|
||||
assert iterations * bulkOperation.values() == nextValues.longs.length;
|
||||
assert iterations * bulkOperation.blocks() == nextBlocks.length;
|
||||
nextBlocks = new long[iterations * bulkOperation.blockCount()];
|
||||
nextValues = new LongsRef(new long[iterations * bulkOperation.valueCount()], 0, 0);
|
||||
assert iterations * bulkOperation.valueCount() == nextValues.longs.length;
|
||||
assert iterations * bulkOperation.blockCount() == nextBlocks.length;
|
||||
nextValues.offset = nextValues.longs.length;
|
||||
position = -1;
|
||||
}
|
||||
|
@ -70,7 +70,7 @@ final class PackedReaderIterator extends PackedInts.ReaderIteratorImpl {
|
|||
nextBlocks[i] = 0L;
|
||||
}
|
||||
|
||||
bulkOperation.get(nextBlocks, 0, nextValues.longs, 0, iterations);
|
||||
bulkOperation.decode(nextBlocks, 0, nextValues.longs, 0, iterations);
|
||||
nextValues.offset = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.store.DataOutput;
|
|||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
// Packs high order byte first, to match
|
||||
// IndexOutput.writeInt/Long/Short byte order
|
||||
|
@ -29,21 +30,20 @@ final class PackedWriter extends PackedInts.Writer {
|
|||
|
||||
boolean finished;
|
||||
final PackedInts.Format format;
|
||||
final BulkOperation bulkOperation;
|
||||
final BulkOperation encoder;
|
||||
final long[] nextBlocks;
|
||||
final long[] nextValues;
|
||||
final int iterations;
|
||||
int off;
|
||||
int written;
|
||||
|
||||
PackedWriter(PackedInts.Format format, DataOutput out, int valueCount, int bitsPerValue, int mem)
|
||||
throws IOException {
|
||||
PackedWriter(PackedInts.Format format, DataOutput out, int valueCount, int bitsPerValue, int mem) {
|
||||
super(out, valueCount, bitsPerValue);
|
||||
this.format = format;
|
||||
bulkOperation = BulkOperation.of(format, bitsPerValue);
|
||||
iterations = bulkOperation.computeIterations(valueCount, mem);
|
||||
nextBlocks = new long[iterations * bulkOperation.blocks()];
|
||||
nextValues = new long[iterations * bulkOperation.values()];
|
||||
encoder = BulkOperation.of(format, bitsPerValue);
|
||||
iterations = encoder.computeIterations(valueCount, mem);
|
||||
nextBlocks = new long[iterations * encoder.blockCount()];
|
||||
nextValues = new long[iterations * encoder.valueCount()];
|
||||
off = 0;
|
||||
written = 0;
|
||||
finished = false;
|
||||
|
@ -63,8 +63,7 @@ final class PackedWriter extends PackedInts.Writer {
|
|||
}
|
||||
nextValues[off++] = v;
|
||||
if (off == nextValues.length) {
|
||||
flush(nextValues.length);
|
||||
off = 0;
|
||||
flush();
|
||||
}
|
||||
++written;
|
||||
}
|
||||
|
@ -77,16 +76,17 @@ final class PackedWriter extends PackedInts.Writer {
|
|||
add(0L);
|
||||
}
|
||||
}
|
||||
flush(off);
|
||||
flush();
|
||||
finished = true;
|
||||
}
|
||||
|
||||
private void flush(int nvalues) throws IOException {
|
||||
bulkOperation.set(nextBlocks, 0, nextValues, 0, iterations);
|
||||
final int blocks = format.nblocks(bitsPerValue, nvalues);
|
||||
private void flush() throws IOException {
|
||||
encoder.encode(nextValues, 0, nextBlocks, 0, iterations);
|
||||
final int blocks = format.nblocks(bitsPerValue, off);
|
||||
for (int i = 0; i < blocks; ++i) {
|
||||
out.writeLong(nextBlocks[i]);
|
||||
}
|
||||
Arrays.fill(nextValues, 0L);
|
||||
off = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -42,22 +42,52 @@ package org.apache.lucene.util.packed;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* Efficient sequential read/write of packed integers.
|
||||
*/
|
||||
abstract class BulkOperation {
|
||||
enum BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {
|
||||
"""
|
||||
|
||||
static final EnumMap<PackedInts.Format, BulkOperation[]> BULK_OPERATIONS = new EnumMap<PackedInts.Format, BulkOperation[]>(PackedInts.Format.class);
|
||||
FOOTER="""
|
||||
|
||||
public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {
|
||||
assert bitsPerValue > 0 && bitsPerValue <= 64;
|
||||
BulkOperation[] ops = BULK_OPERATIONS.get(format);
|
||||
if (ops == null || ops[bitsPerValue] == null) {
|
||||
throw new IllegalArgumentException("format: " + format + ", bitsPerValue: " + bitsPerValue);
|
||||
private static long[] toLongArray(int[] ints, int offset, int length) {
|
||||
long[] arr = new long[length];
|
||||
for (int i = 0; i < length; ++i) {
|
||||
arr[i] = ints[offset + i];
|
||||
}
|
||||
return ops[bitsPerValue];
|
||||
return arr;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encode(int[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {
|
||||
encode(toLongArray(values, valuesOffset, iterations * valueCount()), 0, blocks, blocksOffset, iterations);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encode(long[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations) {
|
||||
final long[] longBLocks = new long[blockCount() * iterations];
|
||||
encode(values, valuesOffset, longBLocks, 0, iterations);
|
||||
ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer().put(longBLocks);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations) {
|
||||
final long[] longBLocks = new long[blockCount() * iterations];
|
||||
encode(values, valuesOffset, longBLocks, 0, iterations);
|
||||
ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer().put(longBLocks);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -67,7 +97,7 @@ abstract class BulkOperation {
|
|||
* - 16 bits per value -> b=1, v=4
|
||||
* - 24 bits per value -> b=3, v=8
|
||||
* - 50 bits per value -> b=25, v=32
|
||||
* - 63 bits per value -> b=63, v = 64
|
||||
* - 63 bits per value -> b=63, v=64
|
||||
* - ...
|
||||
*
|
||||
* A bulk read consists in copying <code>iterations*v</code> values that are
|
||||
|
@ -79,87 +109,155 @@ abstract class BulkOperation {
|
|||
* <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes).
|
||||
*/
|
||||
public final int computeIterations(int valueCount, int ramBudget) {
|
||||
final int iterations = (ramBudget >>> 3) / (blocks() + values());
|
||||
final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount());
|
||||
if (iterations == 0) {
|
||||
// at least 1
|
||||
return 1;
|
||||
} else if ((iterations - 1) * blocks() >= valueCount) {
|
||||
} else if ((iterations - 1) * blockCount() >= valueCount) {
|
||||
// don't allocate for more than the size of the reader
|
||||
return (int) Math.ceil((double) valueCount / values());
|
||||
return (int) Math.ceil((double) valueCount / valueCount());
|
||||
} else {
|
||||
return iterations;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The minimum number of blocks required to perform a bulk get/set.
|
||||
*/
|
||||
public abstract int blocks();
|
||||
|
||||
/**
|
||||
* The number of values that can be stored in <code>blocks()</code> blocks.
|
||||
*/
|
||||
public abstract int values();
|
||||
|
||||
/**
|
||||
* Get <code>n * values()</code> values from <code>n * blocks()</code> blocks.
|
||||
*/
|
||||
public abstract void get(long[] blocks, int blockIndex, long[] values, int valuesIndex, int iterations);
|
||||
|
||||
/**
|
||||
* Set <code>n * values()</code> values into <code>n * blocks()</code> blocks.
|
||||
*/
|
||||
public abstract void set(long[] blocks, int blockIndex, long[] values, int valuesIndex, int iterations);
|
||||
|
||||
}
|
||||
"""

FOOTER = "}"

def casts(typ):
cast_start = "(%s) (" %typ
cast_end = ")"
if typ == "long":
cast_start = ""
cast_end = ""
return cast_start, cast_end

def masks(bits):
if bits == 64:
return "", ""
return "(", " & %sL)" %(hex((1 << bits) - 1))

def get_type(bits):
if bits == 8:
return "byte"
elif bits == 16:
return "short"
elif bits == 32:
return "int"
elif bits == 64:
return "long"
else:
assert False

def packed64singleblock(bpv, f):
values = 64 / bpv
f.write("\n static final class Packed64SingleBlockBulkOperation%d extends BulkOperation {\n\n" %bpv)
f.write(" public int blocks() {\n")
f.write("\n PACKED_SINGLE_BLOCK_%d {\n\n" %bpv)
f.write(" public int blockCount() {\n")
f.write(" return 1;\n")
f.write(" }\n\n")
f.write(" public int values() {\n")
f.write(" public int valueCount() {\n")
f.write(" return %d;\n" %values)
f.write(" }\n\n")
p64sb_decode(bpv, 32)
p64sb_decode(bpv, 64)
p64sb_encode(bpv, 32)
p64sb_encode(bpv, 64)
f.write(" }")

f.write(" public void get(long[] blocks, int bi, long[] values, int vi, int iterations) {\n")
f.write(" assert bi + iterations * blocks() <= blocks.length;\n")
f.write(" assert vi + iterations * values() <= values.length;\n")
def p64sb_decode(bpv, bits):
values = 64 / bpv
typ = get_type(bits)
cast_start, cast_end = casts(typ)
f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
f.write(" final long block = blocks[bi++];\n")
f.write(" final long block = blocks[blocksOffset++];\n")
mask = (1 << bpv) - 1
for i in xrange(values):
block_offset = i / values
offset_in_block = i % values
if i == 0:
f.write(" values[vi++] = block & %dL;\n" %mask)
f.write(" values[valuesOffset++] = %sblock & %dL%s;\n" %(cast_start, mask, cast_end))
elif i == values - 1:
f.write(" values[vi++] = block >>> %d;\n" %(i * bpv))
f.write(" values[valuesOffset++] = %sblock >>> %d%s;\n" %(cast_start, i * bpv, cast_end))
else:
f.write(" values[vi++] = (block >>> %d) & %dL;\n" %(i * bpv, mask))
f.write(" values[valuesOffset++] = %s(block >>> %d) & %dL%s;\n" %(cast_start, i * bpv, mask, cast_end))
f.write(" }\n")
f.write(" }\n\n")

f.write(" public void set(long[] blocks, int bi, long[] values, int vi, int iterations) {\n")
f.write(" assert bi + iterations * blocks() <= blocks.length;\n")
f.write(" assert vi + iterations * values() <= values.length;\n")
f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
f.write(" assert blocksOffset + 8 * iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
if bpv >= 32 and bits > 32:
for i in xrange(7, -1, -1):
f.write(" final long byte%d = blocks[blocksOffset++] & 0xFF;\n" %i)
else:
for i in xrange(7, -1, -1):
f.write(" final int byte%d = blocks[blocksOffset++] & 0xFF;\n" %i)
for i in xrange(values):
byte_start = (i * bpv) / 8
bit_start = (i * bpv) % 8
byte_end = ((i + 1) * bpv - 1) / 8
bit_end = ((i + 1) * bpv - 1) % 8
f.write(" values[valuesOffset++] =")
if byte_start == byte_end:
# only one byte
if bit_start == 0:
if bit_end == 7:
f.write(" byte%d" %byte_start)
else:
f.write(" byte%d & %d" %(byte_start, mask))
else:
if bit_end == 7:
f.write(" byte%d >>> %d" %(byte_start, bit_start))
else:
f.write(" (byte%d >>> %d) & %d" %(byte_start, bit_start, mask))
else:
if bit_start == 0:
f.write(" byte%d" %byte_start)
else:
f.write(" (byte%d >>> %d)" %(byte_start, bit_start))
for b in xrange(byte_start + 1, byte_end):
f.write(" | (byte%d << %d)" %(b, 8 * (b - byte_start) - bit_start))
if bit_end == 7:
f.write(" | (byte%d << %d)" %(byte_end, 8 * (byte_end - byte_start) - bit_start))
else:
f.write(" | ((byte%d & %d) << %d)" %(byte_end, 2 ** (bit_end + 1) - 1, 8 * (byte_end - byte_start) - bit_start))
f.write(";\n")
f.write(" }\n")
f.write(" }\n\n")

def p64sb_encode(bpv, bits):
values = 64 / bpv
typ = get_type(bits)
mask_start, mask_end = masks(bits)
f.write(" public void encode(%s[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
for i in xrange(values):
block_offset = i / values
offset_in_block = i % values
if i == 0:
f.write(" blocks[bi++] = values[vi++]")
f.write(" blocks[blocksOffset++] = %svalues[valuesOffset++]%s" %(mask_start, mask_end))
else:
f.write(" | (values[vi++] << %d)" %(i * bpv))
f.write(" | (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, i * bpv))
if i == values - 1:
f.write(";\n")
f.write(" }\n")
f.write(" }\n")

f.write(" }\n")
f.write(" }\n\n")

def packed64(bpv, f):
blocks = bpv
@ -169,96 +267,180 @@ def packed64(bpv, f):
values /= 2
assert values * bpv == 64 * blocks, "%d values, %d blocks, %d bits per value" %(values, blocks, bpv)
mask = (1 << bpv) - 1
f.write(" static final class Packed64BulkOperation%d extends BulkOperation {\n\n" %bpv)
f.write(" public int blocks() {\n")
f.write(" PACKED_%d {\n\n" %bpv)
f.write(" public int blockCount() {\n")
f.write(" return %d;\n" %blocks)
f.write(" }\n\n")
f.write(" public int values() {\n")
f.write(" public int valueCount() {\n")
f.write(" return %d;\n" %values)
f.write(" }\n\n")

if bpv == 64:
f.write(""" public void get(long[] blocks, int bi, long[] values, int vi, int iterations) {
System.arraycopy(blocks, bi, values, vi, iterations);
f.write(""" public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations);
}

public void set(long[] blocks, int bi, long[] values, int vi, int iterations) {
System.arraycopy(values, bi, blocks, vi, iterations);
public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer());
}

public void encode(long[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {
System.arraycopy(values, valuesOffset, blocks, blocksOffset, valueCount() * iterations);
}

}
""")
return
else:
p64_decode(bpv, 32, values)
p64_decode(bpv, 64, values)
p64_encode(bpv, 32, values)
p64_encode(bpv, 64, values)
f.write(" }\n")

f.write(" public void get(long[] blocks, int bi, long[] values, int vi, int iterations) {\n")
f.write(" assert bi + iterations * blocks() <= blocks.length;\n")
f.write(" assert vi + iterations * values() <= values.length;\n")
def p64_decode(bpv, bits, values):
typ = get_type(bits)
cast_start, cast_end = casts(typ)

f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
mask = (1 << bpv) - 1
for i in xrange(0, values):
block_offset = i * bpv / 64
bit_offset = (i * bpv) % 64
if bit_offset == 0:
# start of block
f.write(" final long block%d = blocks[bi++];\n" %block_offset);
f.write(" values[vi++] = block%d >>> %d;\n" %(block_offset, 64 - bpv))
f.write(" final long block%d = blocks[blocksOffset++];\n" %block_offset);
f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" %(cast_start, block_offset, 64 - bpv, cast_end))
elif bit_offset + bpv == 64:
# end of block
f.write(" values[vi++] = block%d & %dL;\n" %(block_offset, mask))
f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" %(cast_start, block_offset, mask, cast_end))
elif bit_offset + bpv < 64:
# middle of block
f.write(" values[vi++] = (block%d >>> %d) & %dL;\n" %(block_offset, 64 - bit_offset - bpv, mask))
f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" %(cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end))
else:
# value spans across 2 blocks
mask1 = (1 << (64 - bit_offset)) -1
shift1 = bit_offset + bpv - 64
shift2 = 64 - shift1
f.write(" final long block%d = blocks[bi++];\n" %(block_offset + 1));
f.write(" values[vi++] = ((block%d & %dL) << %d) | (block%d >>> %d);\n" %(block_offset, mask1, shift1, block_offset + 1, shift2))
f.write(" final long block%d = blocks[blocksOffset++];\n" %(block_offset + 1));
f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" %(cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end))
f.write(" }\n")
f.write(" }\n\n")

f.write(" public void set(long[] blocks, int bi, long[] values, int vi, int iterations) {\n")
f.write(" assert bi + iterations * blocks() <= blocks.length;\n")
f.write(" assert vi + iterations * values() <= values.length;\n")
f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
f.write(" }\n\n")
return
f.write(" assert blocksOffset + 8 * iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
blocks = values * bpv / 8
for i in xrange(0, values):
byte_start = i * bpv / 8
bit_start = (i * bpv) % 8
byte_end = ((i + 1) * bpv - 1) / 8
bit_end = ((i + 1) * bpv - 1) % 8
shift = lambda b: 8 * (byte_end - b - 1) + 1 + bit_end
if bit_start == 0:
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, byte_start))
for b in xrange(byte_start + 1, byte_end + 1):
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, b))
f.write(" values[valuesOffset++] =")
if byte_start == byte_end:
if bit_start == 0:
if bit_end == 7:
f.write(" byte%d" %byte_start)
else:
f.write(" byte%d >>> %d" %(byte_start, 7 - bit_end))
else:
if bit_end == 7:
f.write(" byte%d & %d" %(byte_start, 2 ** (8 - bit_start) - 1))
else:
f.write(" (byte%d >>> %d) & %d" %(byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1))
else:
if bit_start == 0:
f.write(" (byte%d << %d)" %(byte_start, shift(byte_start)))
else:
f.write(" ((byte%d & %d) << %d)" %(byte_start, 2 ** (8 - bit_start) - 1, shift(byte_start)))
for b in xrange(byte_start + 1, byte_end):
f.write(" | (byte%d << %d)" %(b, shift(b)))
if bit_end == 7:
f.write(" | byte%d" %byte_end)
else:
f.write(" | (byte%d >>> %d)" %(byte_end, 7 - bit_end))
f.write(";\n")
f.write(" }\n")
f.write(" }\n\n")

def p64_encode(bpv, bits, values):
typ = get_type(bits)
mask_start, mask_end = masks(bits)
f.write(" public void encode(%s[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations) {\n" %typ)
f.write(" assert blocksOffset + iterations * blockCount() <= blocks.length;\n")
f.write(" assert valuesOffset + iterations * valueCount() <= values.length;\n")
f.write(" for (int i = 0; i < iterations; ++i) {\n")
for i in xrange(0, values):
block_offset = i * bpv / 64
bit_offset = (i * bpv) % 64
if bit_offset == 0:
# start of block
f.write(" blocks[bi++] = (values[vi++] << %d)" %(64 - bpv))
f.write(" blocks[blocksOffset++] = (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - bpv))
elif bit_offset + bpv == 64:
# end of block
f.write(" | values[vi++];\n")
f.write(" | %svalues[valuesOffset++]%s;\n" %(mask_start, mask_end))
elif bit_offset + bpv < 64:
# inside a block
f.write(" | (values[vi++] << %d)" %(64 - bit_offset - bpv))
f.write(" | (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - bit_offset - bpv))
else:
# value spans across 2 blocks
right_bits = bit_offset + bpv - 64
f.write(" | (values[vi] >>> %d);\n" %right_bits)
f.write(" blocks[bi++] = (values[vi++] << %d)" %(64 - right_bits))
f.write(" | (%svalues[valuesOffset]%s >>> %d);\n" %(mask_start, mask_end, right_bits))
f.write(" blocks[blocksOffset++] = (%svalues[valuesOffset++]%s << %d)" %(mask_start, mask_end, 64 - right_bits))
f.write(" }\n")
f.write(" }\n")

f.write(" }\n\n")


if __name__ == '__main__':
p64_bpv = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32]
f = open(OUTPUT_FILE, 'w')
f.write(HEADER)
f.write(" static {\n")
f.write(" BULK_OPERATIONS.put(PackedInts.Format.PACKED, new BulkOperation[65]);")
for bpv in xrange(1, 65):
f.write(" BULK_OPERATIONS.get(PackedInts.Format.PACKED)[%d] = new Packed64BulkOperation%d();\n" %(bpv, bpv))
f.write(" BULK_OPERATIONS.put(PackedInts.Format.PACKED_SINGLE_BLOCK, new BulkOperation[65]);\n")
for bpv in PACKED_64_SINGLE_BLOCK_BPV:
f.write(" BULK_OPERATIONS.get(PackedInts.Format.PACKED_SINGLE_BLOCK)[%d] = new Packed64SingleBlockBulkOperation%d();\n" %(bpv, bpv))
f.write(" }\n")
for bpv in xrange(1, 65):
packed64(bpv, f)
f.write(" ,\n")
for bpv in PACKED_64_SINGLE_BLOCK_BPV:
if bpv != PACKED_64_SINGLE_BLOCK_BPV[0]:
f.write(" ,\n")
packed64singleblock(bpv,f)
f.write(" ;\n\n")
f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n")
f.write(" switch (format) {\n")

f.write(" case PACKED:\n")
f.write(" switch (bitsPerValue) {\n")
for i in xrange(1, 65):
f.write(" case %d:\n" %i)
f.write(" return PACKED_%d;\n" %i)
f.write(" default:\n")
f.write(" throw new AssertionError();\n")
f.write(" }\n")
f.write(" case PACKED_SINGLE_BLOCK:\n")
f.write(" switch (bitsPerValue) {\n")
for i in PACKED_64_SINGLE_BLOCK_BPV:
f.write(" case %d:\n" %i)
f.write(" return PACKED_SINGLE_BLOCK_%d;\n" %i)
f.write(" default:\n")
f.write(" throw new AssertionError();\n")
f.write(" }\n")
f.write(" default:\n")
f.write(" throw new AssertionError();\n")
f.write(" }\n")
f.write(" }\n")
f.write(FOOTER)
f.close()
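The lookup written by the two case loops above ends up, roughly, as the following method on the generated BulkOperation class (a sketch with the repetitive cases elided):

public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {
  switch (format) {
  case PACKED:
    switch (bitsPerValue) {
    case 1:
      return PACKED_1;
    case 2:
      return PACKED_2;
    // ... one case per bitsPerValue, up to PACKED_64
    default:
      throw new AssertionError();
    }
  case PACKED_SINGLE_BLOCK:
    switch (bitsPerValue) {
    case 1:
      return PACKED_SINGLE_BLOCK_1;
    // ... one case per entry in PACKED_64_SINGLE_BLOCK_BPV
    default:
      throw new AssertionError();
    }
  default:
    throw new AssertionError();
  }
}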

@ -19,3 +19,4 @@ org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat
@ -0,0 +1,94 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.packed.PackedInts;

import com.carrotsearch.randomizedtesting.generators.RandomInts;

public class TestForUtil extends LuceneTestCase {

public void testEncodeDecode() throws IOException {
final int iterations = RandomInts.randomIntBetween(random(), 1, 1000);
final float acceptableOverheadRatio = random().nextFloat();
final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bpv = random().nextInt(32);
if (bpv == 0) {
final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE);
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = value;
}
} else {
for (int j = 0; j < BLOCK_SIZE; ++j) {
values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(),
0, (int) PackedInts.maxValue(bpv));
}
}
}

final Directory d = new RAMDirectory();
final long endPointer;

{
// encode
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out);

for (int i = 0; i < iterations; ++i) {
forUtil.writeBlock(
Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length),
new byte[MAX_ENCODED_SIZE], out);
}
endPointer = out.getFilePointer();
out.close();
}

{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
final ForUtil forUtil = new ForUtil(in);
for (int i = 0; i < iterations; ++i) {
if (random().nextBoolean()) {
forUtil.skipBlock(in);
continue;
}
final int[] restored = new int[MAX_DATA_SIZE];
forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored);
assertArrayEquals(Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE),
Arrays.copyOf(restored, BLOCK_SIZE));
}
assertEquals(endPointer, in.getFilePointer());
in.close();
}
}

}
@ -933,6 +933,7 @@ public class TestPostingsFormat extends LuceneTestCase {
// NOTE: you can also test "weaker" index options than
// you indexed with:
testTerms(fieldsProducer, EnumSet.allOf(Option.class), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
//testTerms(fieldsProducer, EnumSet.complementOf(EnumSet.of(Option.THREADS)), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

fieldsProducer.close();
dir.close();
@ -18,17 +18,24 @@ package org.apache.lucene.util.packed;
*/

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts.Reader;

@Slow
@ -622,4 +629,106 @@ public class TestPackedInts extends LuceneTestCase {
}
}

public void testEncodeDecode() {
for (PackedInts.Format format : PackedInts.Format.values()) {
for (int bpv = 1; bpv <= 64; ++bpv) {
if (!format.isSupported(bpv)) {
continue;
}
String msg = format + " " + bpv;

final PackedInts.Encoder encoder = PackedInts.getEncoder(format, PackedInts.VERSION_CURRENT, bpv);
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PackedInts.VERSION_CURRENT, bpv);
final int blockCount = encoder.blockCount();
final int valueCount = encoder.valueCount();
assertEquals(blockCount, decoder.blockCount());
assertEquals(valueCount, decoder.valueCount());

final int iterations = random().nextInt(100);
final int blocksOffset = random().nextInt(100);
final int valuesOffset = random().nextInt(100);
final int blocksOffset2 = random().nextInt(100);
final int blocksLen = iterations * blockCount;

// 1. generate random inputs
final long[] blocks = new long[blocksOffset + blocksLen];
for (int i = 0; i < blocks.length; ++i) {
blocks[i] = random().nextLong();
if (format == PackedInts.Format.PACKED_SINGLE_BLOCK && 64 % bpv != 0) {
// clear highest bits for packed
final int toClear = 64 % bpv;
blocks[i] = (blocks[i] << toClear) >>> toClear;
}
}

// 2. decode
final long[] values = new long[valuesOffset + iterations * valueCount];
decoder.decode(blocks, blocksOffset, values, valuesOffset, iterations);
for (long value : values) {
assertTrue(value <= PackedInts.maxValue(bpv));
}
// test decoding to int[]
final int[] intValues;
if (bpv <= 32) {
intValues = new int[values.length];
decoder.decode(blocks, blocksOffset, intValues, valuesOffset, iterations);
assertTrue(equals(intValues, values));
} else {
intValues = null;
}

// 3. re-encode
final long[] blocks2 = new long[blocksOffset2 + blocksLen];
encoder.encode(values, valuesOffset, blocks2, blocksOffset2, iterations);
assertArrayEquals(msg, Arrays.copyOfRange(blocks, blocksOffset, blocks.length),
Arrays.copyOfRange(blocks2, blocksOffset2, blocks2.length));
// test encoding from int[]
if (bpv <= 32) {
final long[] blocks3 = new long[blocks2.length];
encoder.encode(intValues, valuesOffset, blocks3, blocksOffset2, iterations);
assertArrayEquals(msg, blocks2, blocks3);
}

// 4. byte[] decoding
final byte[] byteBlocks = new byte[8 * blocks.length];
ByteBuffer.wrap(byteBlocks).asLongBuffer().put(blocks);
final long[] values2 = new long[valuesOffset + iterations * valueCount];
decoder.decode(byteBlocks, blocksOffset * 8, values2, valuesOffset, iterations);
for (long value : values2) {
assertTrue(msg, value <= PackedInts.maxValue(bpv));
}
assertArrayEquals(msg, values, values2);
// test decoding to int[]
if (bpv <= 32) {
final int[] intValues2 = new int[values2.length];
decoder.decode(byteBlocks, blocksOffset * 8, intValues2, valuesOffset, iterations);
assertTrue(msg, equals(intValues2, values2));
}

// 5. byte[] encoding
final byte[] blocks3 = new byte[8 * (blocksOffset2 + blocksLen)];
encoder.encode(values, valuesOffset, blocks3, 8 * blocksOffset2, iterations);
assertEquals(msg, LongBuffer.wrap(blocks2), ByteBuffer.wrap(blocks3).asLongBuffer());
// test encoding from int[]
if (bpv <= 32) {
final byte[] blocks4 = new byte[blocks3.length];
encoder.encode(intValues, valuesOffset, blocks4, 8 * blocksOffset2, iterations);
assertArrayEquals(msg, blocks3, blocks4);
}
}
}
}

private static boolean equals(int[] ints, long[] longs) {
if (ints.length != longs.length) {
return false;
}
for (int i = 0; i < ints.length; ++i) {
if ((ints[i] & 0xFFFFFFFFL) != longs[i]) {
return false;
}
}
return true;
}

}
@ -87,7 +87,7 @@ public class MockSingleIntIndexInput extends IntIndexInput {
}

@Override
public void set(IntIndexInput.Index other) {
public void copyFrom(IntIndexInput.Index other) {
fp = ((MockSingleIntIndexInputIndex) other).fp;
}
@ -310,9 +310,7 @@ public abstract class LuceneTestCase extends Assert {
"MockFixedIntBlock",
"MockVariableIntBlock",
"MockSep",
"MockRandom",
"For",
"PFor"
"MockRandom"
));

// -----------------------------------------------------------------