mirror of https://github.com/apache/lucene.git
fork postingsformat: TempBlock
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1493376 13f79535-47bb-0310-9956-ffa450edef68
parent 16fe4335b2
commit ee49f10cf0

@@ -0,0 +1,77 @@
package org.apache.lucene.codecs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Closeable;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.codecs.temp.TempTermState;

/** The core terms dictionaries (BlockTermsReader,
 *  BlockTreeTermsReader) interact with a single instance
 *  of this class to manage creation of {@link DocsEnum} and
 *  {@link DocsAndPositionsEnum} instances.  It provides an
 *  IndexInput (termsIn) where this class may read any
 *  previously stored data that it had written in its
 *  corresponding {@link PostingsWriterBase} at indexing
 *  time.
 *  @lucene.experimental */

// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
public abstract class TempPostingsReaderBase implements Closeable {

  /** Sole constructor.  (For invocation by subclass
   *  constructors, typically implicit.) */
  protected TempPostingsReaderBase() {
  }

  /** Performs any initialization, such as reading and
   *  verifying the header from the provided terms
   *  dictionary {@link IndexInput}. */
  public abstract void init(IndexInput termsIn) throws IOException;

  /** Return a newly created empty TermState */
  public abstract TempTermState newTermState() throws IOException;

  /** Actually decode metadata for next term */
  public abstract void nextTerm(FieldInfo fieldInfo, TempTermState state) throws IOException;

  /** Must fully consume state, since after this call that
   *  TermState may be reused. */
  public abstract DocsEnum docs(FieldInfo fieldInfo, TempTermState state, Bits skipDocs, DocsEnum reuse, int flags) throws IOException;

  /** Must fully consume state, since after this call that
   *  TermState may be reused. */
  public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TempTermState state, Bits skipDocs, DocsAndPositionsEnum reuse,
                                                        int flags) throws IOException;

  @Override
  public abstract void close() throws IOException;

  /** Reads data for all terms in the next block; this
   *  method should merely load the byte[] blob but not
   *  decode, which is done in {@link #nextTerm}. */
  public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, TempTermState termState) throws IOException;
}
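For orientation, here is the call sequence a terms dictionary would drive against this reader API. This is an illustrative sketch, not code from this commit: the helper method and loop shape are hypothetical, and only the abstract methods declared above are assumed.

    // Sketch: one block's worth of terms. readTermsBlock() loads the raw
    // metadata blob once; nextTerm() then decodes one term at a time, and the
    // TermState may be reused after each docs() call, as documented above.
    void consumeBlock(TempPostingsReaderBase reader, IndexInput termsIn,
                      FieldInfo fieldInfo, int termCount) throws IOException {
      TempTermState state = reader.newTermState();
      reader.readTermsBlock(termsIn, fieldInfo, state);    // load, don't decode
      for (int i = 0; i < termCount; i++) {
        reader.nextTerm(fieldInfo, state);                 // decode next term's metadata
        DocsEnum docs = reader.docs(fieldInfo, state, null, null, DocsEnum.FLAG_FREQS);
        // ... iterate docs; 'state' is fully consumed and may be reused
      }
    }
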
@@ -0,0 +1,73 @@
package org.apache.lucene.codecs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Closeable;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfo;

/**
 * Extension of {@link PostingsConsumer} to support pluggable term dictionaries.
 * <p>
 * This class contains additional hooks to interact with the provided
 * term dictionaries such as {@link BlockTreeTermsWriter}. If you want
 * to re-use an existing implementation and are only interested in
 * customizing the format of the postings list, extend this class
 * instead.
 *
 * @see PostingsReaderBase
 * @lucene.experimental
 */
// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
public abstract class TempPostingsWriterBase extends PostingsConsumer implements Closeable {

  /** Sole constructor.  (For invocation by subclass
   *  constructors, typically implicit.) */
  protected TempPostingsWriterBase() {
  }

  /** Called once after startup, before any terms have been
   *  added.  Implementations typically write a header to
   *  the provided {@code termsOut}. */
  public abstract void start(IndexOutput termsOut) throws IOException;

  /** Start a new term.  Note that a matching call to {@link
   *  #finishTerm(TermStats)} is done only if the term has at least one
   *  document. */
  public abstract void startTerm() throws IOException;

  /** Flush count terms starting at start "backwards", as a
   *  block.  start is a negative offset from the end of the
   *  terms stack, i.e. bigger start means further back in
   *  the stack. */
  public abstract void flushTermsBlock(int start, int count) throws IOException;

  /** Finishes the current term.  The provided {@link
   *  TermStats} contains the term's summary statistics. */
  public abstract void finishTerm(TermStats stats) throws IOException;

  /** Called when the writing switches to another field. */
  public abstract void setField(FieldInfo fieldInfo);

  @Override
  public abstract void close() throws IOException;
}
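The write side imposes a fixed call order on implementations. A minimal sketch of a hypothetical driver follows (not part of this commit; the TermStats values and flushTermsBlock arguments are made up for illustration):

    // Sketch: the lifecycle a terms dictionary imposes on a TempPostingsWriterBase.
    void writeOneField(TempPostingsWriterBase writer, IndexOutput termsOut,
                       FieldInfo fieldInfo) throws IOException {
      writer.start(termsOut);              // once: write the header
      writer.setField(fieldInfo);          // per field: freq/pos/offset/payload flags
      writer.startTerm();                  // per term
      // ... postings added via the inherited PostingsConsumer methods ...
      writer.finishTerm(new TermStats(1, 1L));  // only for terms with >= 1 document
      writer.flushTermsBlock(1, 1);        // flush buffered term metadata as a block
      writer.close();
    }
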
@@ -33,7 +33,7 @@ import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
  * Encode all values in normal area with fixed bit width,
  * which is determined by the max value in this block.
  */
-final class ForUtil {
+public final class ForUtil {
 
   /**
    * Special number of bits per value used whenever all values to encode are equal.
@@ -44,7 +44,7 @@ final class ForUtil {
    * Upper limit of the number of bytes that might be required to store
    * <code>BLOCK_SIZE</code> encoded values.
    */
-  static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
+  public static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
 
   /**
    * Upper limit of the number of values that might be decoded in a single call to
@@ -52,7 +52,7 @@ final class ForUtil {
    * <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
    * whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
    */
-  static final int MAX_DATA_SIZE;
+  public static final int MAX_DATA_SIZE;
   static {
     int maxDataSize = 0;
     for(int version=PackedInts.VERSION_START;version<=PackedInts.VERSION_CURRENT;version++) {
@@ -96,7 +96,7 @@ final class ForUtil {
   /**
    * Create a new {@link ForUtil} instance and save state into <code>out</code>.
    */
-  ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
+  public ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
     out.writeVInt(PackedInts.VERSION_CURRENT);
     encodedSizes = new int[33];
     encoders = new PackedInts.Encoder[33];
@@ -122,7 +122,7 @@ final class ForUtil {
   /**
    * Restore a {@link ForUtil} from a {@link DataInput}.
    */
-  ForUtil(DataInput in) throws IOException {
+  public ForUtil(DataInput in) throws IOException {
     int packedIntsVersion = in.readVInt();
     PackedInts.checkVersion(packedIntsVersion);
     encodedSizes = new int[33];
@@ -154,7 +154,7 @@ final class ForUtil {
    * @param out the destination output
    * @throws IOException If there is a low-level I/O error
    */
-  void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
+  public void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
     if (isAllEqual(data)) {
       out.writeByte((byte) ALL_VALUES_EQUAL);
       out.writeVInt(data[0]);
@@ -183,7 +183,7 @@ final class ForUtil {
    * @param decoded where to write decoded data
    * @throws IOException If there is a low-level I/O error
    */
-  void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
+  public void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
     final int numBits = in.readByte();
     assert numBits <= 32 : numBits;
 
@@ -209,7 +209,7 @@ final class ForUtil {
    * @param in the input where to read data
    * @throws IOException If there is a low-level I/O error
    */
-  void skipBlock(IndexInput in) throws IOException {
+  public void skipBlock(IndexInput in) throws IOException {
     final int numBits = in.readByte();
     if (numBits == ALL_VALUES_EQUAL) {
       in.readVInt();
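The ForUtil javadoc above says each packed block's bit width is determined by the block's maximum value. A worked sketch of that selection (illustrative only; of the API it uses, only PackedInts.bitsRequired is assumed):

    // Sketch: for 128 deltas whose max is 113, PackedInts.bitsRequired(113) == 7,
    // so the block packs 7 bits per value (~112 bytes) instead of 4 bytes per int
    // (512); MAX_ENCODED_SIZE = BLOCK_SIZE * 4 is exactly the 32-bit worst case.
    static int bitsPerValue(int[] block) {
      long max = 0;
      for (int v : block) {
        max = Math.max(max, v);
      }
      return PackedInts.bitsRequired(max);  // 1..32 for non-negative int deltas
    }
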
@@ -161,7 +161,7 @@ import org.apache.lucene.util.packed.PackedInts;
  *    <li>SkipFPDelta determines the position of this term's SkipData within the .doc
  *        file. In particular, it is the length of the TermFreq data.
  *        SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
- *        (i.e. 128 in Lucene41PostingsFormat).</li>
+ *        (i.e. 8 in Lucene41PostingsFormat).</li>
  *    <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
  *        of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
  *        single document ID is written to the term dictionary.</li>
@@ -50,7 +50,7 @@ import org.apache.lucene.store.IndexInput;
  * Therefore, we'll trim df before passing it to the interface. see trim(int)
  *
  */
-final class Lucene41SkipReader extends MultiLevelSkipListReader {
+public final class Lucene41SkipReader extends MultiLevelSkipListReader {
   // private boolean DEBUG = Lucene41PostingsReader.DEBUG;
   private final int blockSize;
 
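The trim mentioned in the comment above ("we'll trim df before passing it to the interface") matches the TrimmedDocFreq formula quoted later in this commit's format javadoc (TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq). The method body is not shown in this hunk; a plausible sketch, using the blockSize field declared above:

    // Sketch: when df is an exact multiple of the block size there is no skip
    // entry for the last block, so one posting is shaved off before handing df
    // to MultiLevelSkipListReader.
    protected int trim(int df) {
      return df % blockSize == 0 ? df - 1 : df;
    }
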
@@ -43,7 +43,7 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter;
  * 4. start offset.
  *
  */
-final class Lucene41SkipWriter extends MultiLevelSkipListWriter {
+public final class Lucene41SkipWriter extends MultiLevelSkipListWriter {
   // private boolean DEBUG = Lucene41PostingsReader.DEBUG;
 
   private int[] lastSkipDoc;
@@ -381,7 +381,13 @@ the term dictionary. Stored fields are compressed by default. </li>
 <a name="Limitations" id="Limitations"></a>
 <h2>Limitations</h2>
 <div>
-<p>Lucene uses a Java <code>int</code> to refer to
+<p>When referring to term numbers, Lucene's current implementation uses a Java
+<code>int</code> to hold the term index, which means the
+maximum number of unique terms in any single index segment is ~2.1 billion
+times the term index interval (default 128) = ~274 billion. This is technically
+not a limitation of the index file format, just of Lucene's current
+implementation.</p>
+<p>Similarly, Lucene uses a Java <code>int</code> to refer to
 document numbers, and the index file format uses an <code>Int32</code>
 on-disk to store document numbers. This is a limitation
 of both the index file format and the current implementation. Eventually these
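The "~274 billion" figure in the added paragraph is just Integer.MAX_VALUE times the default term index interval; a quick check:

    // Sketch of the arithmetic only (no Lucene API involved):
    static long maxUniqueTerms() {
      // ~2.1 billion term indexes per segment, times the default interval of 128:
      return (long) Integer.MAX_VALUE * 128;  // == 274,877,906,816, the "~274 billion" above
    }
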
@@ -0,0 +1,47 @@
package org.apache.lucene.codecs.temp;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.TermState;

public class TempBlockTermState extends TempTermState {
  /** the term's ord in the current block */
  public int termBlockOrd;

  /** Sole constructor.  (For invocation by subclass
   *  constructors, typically implicit.) */
  protected TempBlockTermState() {
  }

  @Override
  public TempBlockTermState clone() {
    TempBlockTermState other = (TempBlockTermState) super.clone();
    return other;
  }

  @Override
  public void copyFrom(TermState _other) {
    assert _other instanceof TempBlockTermState : "can not copy from " + _other.getClass().getName();
    super.copyFrom(_other);
    TempBlockTermState other = (TempBlockTermState) _other;
    termBlockOrd = other.termBlockOrd;
  }

  @Override
  public String toString() {
    return super.toString() + " termBlockOrd=" + termBlockOrd;
  }
}
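
The clone()/copyFrom() pair above exists so callers can snapshot and restore term states cheaply. A hypothetical usage sketch (the helper names are not from this commit):

    // Sketch: snapshot a state (including termBlockOrd), then restore it later.
    TempBlockTermState snapshot(TempBlockTermState live) {
      return live.clone();
    }

    void restore(TempBlockTermState live, TempBlockTermState saved) {
      live.copyFrom(saved);   // copies inherited fields plus termBlockOrd
    }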
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,450 @@
package org.apache.lucene.codecs.temp;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.TempPostingsWriterBase;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 4.1 postings format, which encodes postings in packed integer blocks
 * for fast decode.
 *
 * <p><b>NOTE</b>: this format is still experimental and
 * subject to change without backwards compatibility.
 *
 * <p>
 * Basic idea:
 * <ul>
 *   <li>
 *   <b>Packed Blocks and VInt Blocks</b>:
 *   <p>In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed format}):
 *      the block size (i.e. number of integers inside block) is fixed (currently 128). Additionally blocks
 *      that are all the same value are encoded in an optimized way.</p>
 *   <p>In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}:
 *      the block size is variable.</p>
 *   </li>
 *
 *   <li>
 *   <b>Block structure</b>:
 *   <p>When the postings are long enough, TempPostingsFormat will try to encode most integer data
 *      as a packed block.</p>
 *   <p>Take a term with 259 documents as an example: the first 256 document ids are encoded as two packed
 *      blocks, while the remaining 3 are encoded as one VInt block. </p>
 *   <p>Different kinds of data are always encoded separately into different packed blocks, but may
 *      possibly be interleaved into the same VInt block. </p>
 *   <p>This strategy is applied to pairs:
 *      <document number, frequency>,
 *      <position, payload length>,
 *      <position, offset start, offset length>, and
 *      <position, payload length, offset start, offset length>.</p>
 *   </li>
 *
 *   <li>
 *   <b>Skipdata settings</b>:
 *   <p>The structure of skip table is quite similar to previous version of Lucene. Skip interval is the
 *      same as block size, and each skip entry points to the beginning of each block. However, for
 *      the first block, skip data is omitted.</p>
 *   </li>
 *
 *   <li>
 *   <b>Positions, Payloads, and Offsets</b>:
 *   <p>A position is an integer indicating where the term occurs within one document.
 *      A payload is a blob of metadata associated with current position.
 *      An offset is a pair of integers indicating the tokenized start/end offsets for given term
 *      in current position: it is essentially a specialized payload. </p>
 *   <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a
 *      null payload contributes one count). As mentioned in block structure, it is possible to encode
 *      these three either combined or separately.
 *   <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
 *      position data is separated out as .pos, while payloads and offsets are encoded in .pay (payload
 *      metadata will also be stored directly in .pay). When encoded as VInt blocks, all these three are
 *      stored interleaved into the .pos (so is payload metadata).</p>
 *   <p>With this strategy, the majority of payload and offset data will be outside .pos file.
 *      So for queries that require only position data, running on a full index with payloads and offsets,
 *      this reduces disk pre-fetches.</p>
 *   </li>
 * </ul>
 * </p>
 *
 * <p>
 * Files and detailed format:
 * <ul>
 *   <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
 *   <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
 *   <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
 *   <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
 *   <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
 * </ul>
 * </p>
 *
 * <a name="Termdictionary" id="Termdictionary"></a>
 * <dl>
 * <dd>
 * <b>Term Dictionary</b>
 *
 * <p>The .tim file contains the list of terms in each
 * field along with per-term statistics (such as docfreq)
 * and pointers to the frequencies, positions, payload and
 * skip data in the .doc, .pos, and .pay files.
 * See {@link TempBlockTermsWriter} for more details on the format.
 * </p>
 *
 * <p>NOTE: The term dictionary can plug into different postings implementations:
 * the postings writer/reader are actually responsible for encoding
 * and decoding the Postings Metadata and Term Metadata sections described here:</p>
 *
 * <ul>
 *   <li>Postings Metadata --> Header, PackedBlockSize</li>
 *   <li>Term Metadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
 *       SkipFPDelta?</li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}</li>
 *   <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *    <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *        for the postings.</li>
 *    <li>PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width is
 *        determined by the largest integer. A smaller block size results in smaller variance among the
 *        widths of integers, hence smaller indexes. A larger block size results in more efficient bulk
 *        I/O, hence better acceleration. This value should always be a multiple of 64, currently fixed
 *        as 128 as a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.
 *    <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file.
 *        In particular, it is the difference of file offset between this term's
 *        data and previous term's data (or zero, for the first term in the block). On disk it is
 *        stored as the difference from previous value in sequence. </li>
 *    <li>PosFPDelta determines the position of this term's TermPositions within the .pos file,
 *        while PayFPDelta determines the position of this term's <TermPayloads, TermOffsets?> within
 *        the .pay file. Similar to DocFPDelta, it is the difference between two file positions (or
 *        neglected, for fields that omit payloads and offsets).</li>
 *    <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in last pos packed
 *        block within the .pos file. It is synonym for PayVIntBlockFPDelta or OffsetVIntBlockFPDelta.
 *        This is actually used to indicate whether it is necessary to load following
 *        payloads and offsets from .pos instead of .pay. Every time a new block of positions are to be
 *        loaded, the PostingsReader will use this value to check whether current block is packed format
 *        or VInt. When packed format, payloads and offsets are fetched from .pay, otherwise from .pos.
 *        (this value is neglected when the total number of positions, i.e. totalTermFreq, is less than
 *        or equal to PackedBlockSize).
 *    <li>SkipFPDelta determines the position of this term's SkipData within the .doc
 *        file. In particular, it is the length of the TermFreq data.
 *        SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
 *        (i.e. 8 in TempPostingsFormat).</li>
 *    <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
 *        of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
 *        single document ID is written to the term dictionary.</li>
 * </ul>
 * </dd>
 * </dl>
 *
 * <a name="Termindex" id="Termindex"></a>
 * <dl>
 * <dd>
 * <b>Term Index</b>
 * <p>The .tip file contains an index into the term dictionary, so that it can be
 * accessed randomly. See {@link TempBlockTermsWriter} for more details on the format.</p>
 * </dd>
 * </dl>
 *
 *
 * <a name="Frequencies" id="Frequencies"></a>
 * <dl>
 * <dd>
 * <b>Frequencies and Skip Data</b>
 *
 * <p>The .doc file contains the lists of documents which contain each term, along
 * with the frequency of the term in that document (except when frequencies are
 * omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of
 * each packed or VInt block, when the length of document list is larger than packed block size.</p>
 *
 * <ul>
 *   <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup></li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>TermFreqs --> <PackedBlock> <sup>PackedDocBlockNum</sup>,
 *                     VIntBlock? </li>
 *   <li>PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
 *   <li>VIntBlock --> <DocDelta[, Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
 *   <li>SkipData --> <<SkipLevelLength, SkipLevel>
 *       <sup>NumSkipLevels-1</sup>, SkipLevel>, SkipDatum?</li>
 *   <li>SkipLevel --> <SkipDatum> <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li>
 *   <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
 *       PayFPSkip?>?, SkipChildLevelPointer?</li>
 *   <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}</li>
 *   <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto, PayFPSkip
 *       -->
 *   {@link DataOutput#writeVInt VInt}</li>
 *   <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>PackedDocDeltaBlock is theoretically generated from two steps:
 *     <ol>
 *       <li>Calculate the difference between each document number and previous one,
 *           and get a d-gaps list (for the first document, use absolute value); </li>
 *       <li>For those d-gaps from first one to PackedDocBlockNum*PackedBlockSize<sup>th</sup>,
 *           separately encode as packed blocks.</li>
 *     </ol>
 *     If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
 *   </li>
 *   <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a format
 *       that encodes DocDelta and Freq:
 *       <p>DocDelta: if frequencies are indexed, this determines both the document
 *       number and the frequency. In particular, DocDelta/2 is the difference between
 *       this document number and the previous document number (or zero when this is the
 *       first document in a TermFreqs). When DocDelta is odd, the frequency is one.
 *       When DocDelta is even, the frequency is read as another VInt. If frequencies
 *       are omitted, DocDelta contains the gap (not multiplied by 2) between document
 *       numbers and no frequency information is stored.</p>
 *       <p>For example, the TermFreqs for a term which occurs once in document seven
 *       and three times in document eleven, with frequencies indexed, would be the
 *       following sequence of VInts:</p>
 *       <p>15, 8, 3</p>
 *       <p>If frequencies were omitted ({@link IndexOptions#DOCS_ONLY}) it would be this
 *       sequence of VInts instead:</p>
 *       <p>7,4</p>
 *   </li>
 *   <li>PackedDocBlockNum is the number of packed blocks for current term's docids or frequencies.
 *       In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) </li>
 *   <li>TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq.
 *       We use this trick since the definition of skip entry is a little different from base interface.
 *       In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
 *       skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
 *       in TempPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
 *       2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
 *       When DocFreq is multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
 *       more skip data than TempSkipWriter. </li>
 *   <li>SkipDatum is the metadata of one skip entry.
 *      For the first block (no matter packed or VInt), it is omitted.</li>
 *   <li>DocSkip records the document number of every PackedBlockSize<sup>th</sup> document number in
 *       the postings (i.e. last document number in each packed block). On disk it is stored as the
 *       difference from previous value in the sequence. </li>
 *   <li>DocFPSkip records the file offsets of each block (excluding the first one), i.e. the posting at
 *       PackedBlockSize+1<sup>th</sup>, 2*PackedBlockSize+1<sup>th</sup> ... , in DocFile.
 *       The file offsets are relative to the start of current term's TermFreqs.
 *       On disk it is also stored as the difference from previous SkipDatum in the sequence.</li>
 *   <li>Since positions and payloads are also block encoded, the skip should skip to related block first,
 *       then fetch the values according to in-block offset. PosFPSkip and PayFPSkip record the file
 *       offsets of related block in .pos and .pay, respectively. While PosBlockOffset indicates
 *       which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always
 *       equal to PosBlockOffset). Same as DocFPSkip, the file offsets are relative to the start of
 *       current term's TermFreqs, and stored as a difference sequence.</li>
 *   <li>PayByteUpto indicates the start offset of the current payload. It is equivalent to
 *       the sum of the payload lengths in the current block up to PosBlockOffset</li>
 * </ul>
 * </dd>
 * </dl>
 *
 * <a name="Positions" id="Positions"></a>
 * <dl>
 * <dd>
 * <b>Positions</b>
 * <p>The .pos file contains the lists of positions that each term occurs at within documents. It also
 *    sometimes stores part of payloads and offsets for speedup.</p>
 * <ul>
 *   <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
 *                         VIntBlock? </li>
 *   <li>VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?,
 *                     OffsetDelta?, OffsetLength?><sup>PosVIntCount</sup>
 *   <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
 *   <li>PositionDelta, OffsetDelta, OffsetLength -->
 *   {@link DataOutput#writeVInt VInt}</li>
 *   <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
 *       values for each term document pair are incremental, and ordered by document number.</li>
 *   <li>PackedPosBlockNum is the number of packed blocks for current term's positions, payloads or offsets.
 *       In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize) </li>
 *   <li>PosVIntCount is the number of positions encoded as VInt format. In particular,
 *       PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize</li>
 *   <li>The procedure by which PackedPosDeltaBlock is generated is the same as for PackedDocDeltaBlock
 *       in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
 *   <li>PositionDelta is, if payloads are disabled for the term's field, the
 *       difference between the position of the current occurrence in the document and
 *       the previous occurrence (or zero, if this is the first occurrence in this
 *       document). If payloads are enabled for the term's field, then PositionDelta/2
 *       is the difference between the current and the previous position. If payloads
 *       are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
 *       the length of the payload at the current term position.</li>
 *   <li>For example, the TermPositions for a term which occurs as the fourth term in
 *       one document, and as the fifth and ninth term in a subsequent document, would
 *       be the following sequence of VInts (payloads disabled):
 *       <p>4, 5, 4</p></li>
 *   <li>PayloadData is metadata associated with the current term position. If
 *       PayloadLength is stored at the current position, then it indicates the length
 *       of this payload. If PayloadLength is not stored, then this payload has the same
 *       length as the payload at the previous position.</li>
 *   <li>OffsetDelta/2 is the difference between this position's startOffset from the
 *       previous occurrence (or zero, if this is the first occurrence in this document).
 *       If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
 *       previous occurrence and an OffsetLength follows. Offset data is only written for
 *       {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.</li>
 * </ul>
 * </dd>
 * </dl>
 *
 * <a name="Payloads" id="Payloads"></a>
 * <dl>
 * <dd>
 * <b>Payloads and Offsets</b>
 * <p>The .pay file will store payloads and offsets associated with certain term-document positions.
 *    Some payloads and offsets will be separated out into .pos file, for performance reasons.</p>
 * <ul>
 *   <li>PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup></li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> <sup>PackedPayBlockNum</sup>
 *   <li>TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> <sup>PackedPayBlockNum</sup>
 *   <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}</li>
 *   <li>SumPayLength --> {@link DataOutput#writeVInt VInt}</li>
 *   <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup></li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of
 *       payload/offsets are stored in .pos.</li>
 *   <li>The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the
 *       same as for PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>,
 *       while PackedOffsetStartDeltaBlock follows the same procedure as PackedDocDeltaBlock.</li>
 *   <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also synonym
 *       for PackedOffsetBlockNum.</li>
 *   <li>SumPayLength is the total length of payloads written within one block, and should be the sum
 *       of PayLengths in one packed block.</li>
 *   <li>PayLength in PackedPayLengthBlock is the length of each payload associated with the current
 *       position.</li>
 * </ul>
 * </dd>
 * </dl>
 * </p>
 *
 * @lucene.experimental
 */

public final class TempPostingsFormat extends PostingsFormat {
  /**
   * Filename extension for document number, frequencies, and skip data.
   * See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
   */
  public static final String DOC_EXTENSION = "doc";

  /**
   * Filename extension for positions.
   * See chapter: <a href="#Positions">Positions</a>
   */
  public static final String POS_EXTENSION = "pos";

  /**
   * Filename extension for payloads and offsets.
   * See chapter: <a href="#Payloads">Payloads and Offsets</a>
   */
  public static final String PAY_EXTENSION = "pay";

  private final int minTermBlockSize;
  private final int maxTermBlockSize;

  /**
   * Fixed packed block size, number of integers encoded in
   * a single packed block.
   */
  // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
  public final static int BLOCK_SIZE = 128;

  /** Creates {@code TempPostingsFormat} with default
   *  settings. */
  public TempPostingsFormat() {
    this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /** Creates {@code TempPostingsFormat} with custom
   *  values for {@code minBlockSize} and {@code
   *  maxBlockSize} passed to block terms dictionary.
   *  @see TempBlockTermsWriter#TempBlockTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
  public TempPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super("TempBlock");
    this.minTermBlockSize = minTermBlockSize;
    assert minTermBlockSize > 1;
    this.maxTermBlockSize = maxTermBlockSize;
    assert minTermBlockSize <= maxTermBlockSize;
  }

  @Override
  public String toString() {
    return getName() + "(blocksize=" + BLOCK_SIZE + ")";
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    TempPostingsWriterBase postingsWriter = new TempPostingsWriter(state);

    boolean success = false;
    try {
      FieldsConsumer ret = new TempBlockTermsWriter(state,
                                                    postingsWriter,
                                                    minTermBlockSize,
                                                    maxTermBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    TempPostingsReaderBase postingsReader = new TempPostingsReader(state.directory,
                                                                   state.fieldInfos,
                                                                   state.segmentInfo,
                                                                   state.context,
                                                                   state.segmentSuffix);
    boolean success = false;
    try {
      FieldsProducer ret = new TempBlockTermsReader(state.directory,
                                                    state.fieldInfos,
                                                    state.segmentInfo,
                                                    postingsReader,
                                                    state.context,
                                                    state.segmentSuffix,
                                                    state.termsIndexDivisor);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsReader);
      }
    }
  }
}
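The DocDelta and PositionDelta schemes in the javadoc above can be sanity-checked in a few lines. This is an illustration of the documented VInt encoding, not code from this commit; the helper name is hypothetical:

    // Doc 7 with freq 1, then doc 11 with freq 3 (freqs indexed) -> 15, 8, 3:
    //   doc 7:  delta 7, freq == 1, so emit 7*2+1 = 15
    //   doc 11: delta 4, freq != 1, so emit 4*2 = 8, then the freq, 3
    // Positions work the same way per document: position 4; then 5 and 9 in the
    // next doc (payloads disabled) -> deltas 4, 5, 4.
    static int[] encodeDocAndFreq(int docDelta, int freq) {
      return freq == 1 ? new int[] { (docDelta << 1) | 1 }
                       : new int[] { docDelta << 1, freq };
    }
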
File diff suppressed because it is too large
@ -0,0 +1,586 @@
|
||||||
|
package org.apache.lucene.codecs.temp;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
|
||||||
|
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
|
||||||
|
import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.TempPostingsWriterBase;
|
||||||
|
import org.apache.lucene.codecs.TermStats;
|
||||||
|
import org.apache.lucene.codecs.lucene41.Lucene41SkipWriter;
|
||||||
|
import org.apache.lucene.codecs.lucene41.ForUtil;
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.RAMOutputStream;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Concrete class that writes docId(maybe frq,pos,offset,payloads) list
|
||||||
|
* with postings format.
|
||||||
|
*
|
||||||
|
* Postings list for each term will be stored separately.
|
||||||
|
*
|
||||||
|
* @see Lucene41SkipWriter for details about skipping setting and postings layout.
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public final class TempPostingsWriter extends TempPostingsWriterBase {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expert: The maximum number of skip levels. Smaller values result in
|
||||||
|
* slightly smaller indexes, but slower skipping in big posting lists.
|
||||||
|
*/
|
||||||
|
static final int maxSkipLevels = 10;
|
||||||
|
|
||||||
|
final static String TERMS_CODEC = "TempPostingsWriterTerms";
|
||||||
|
final static String DOC_CODEC = "TempPostingsWriterDoc";
|
||||||
|
final static String POS_CODEC = "TempPostingsWriterPos";
|
||||||
|
final static String PAY_CODEC = "TempPostingsWriterPay";
|
||||||
|
|
||||||
|
// Increment version to change it
|
||||||
|
final static int VERSION_START = 0;
|
||||||
|
final static int VERSION_CURRENT = VERSION_START;
|
||||||
|
|
||||||
|
final IndexOutput docOut;
|
||||||
|
final IndexOutput posOut;
|
||||||
|
final IndexOutput payOut;
|
||||||
|
|
||||||
|
private IndexOutput termsOut;
|
||||||
|
|
||||||
|
// How current field indexes postings:
|
||||||
|
private boolean fieldHasFreqs;
|
||||||
|
private boolean fieldHasPositions;
|
||||||
|
private boolean fieldHasOffsets;
|
||||||
|
private boolean fieldHasPayloads;
|
||||||
|
|
||||||
|
// Holds starting file pointers for each term:
|
||||||
|
private long docTermStartFP;
|
||||||
|
private long posTermStartFP;
|
||||||
|
private long payTermStartFP;
|
||||||
|
|
||||||
|
final int[] docDeltaBuffer;
|
||||||
|
final int[] freqBuffer;
|
||||||
|
private int docBufferUpto;
|
||||||
|
|
||||||
|
final int[] posDeltaBuffer;
|
||||||
|
final int[] payloadLengthBuffer;
|
||||||
|
final int[] offsetStartDeltaBuffer;
|
||||||
|
final int[] offsetLengthBuffer;
|
||||||
|
private int posBufferUpto;
|
||||||
|
|
||||||
|
private byte[] payloadBytes;
|
||||||
|
private int payloadByteUpto;
|
||||||
|
|
||||||
|
private int lastBlockDocID;
|
||||||
|
private long lastBlockPosFP;
|
||||||
|
private long lastBlockPayFP;
|
||||||
|
private int lastBlockPosBufferUpto;
|
||||||
|
private int lastBlockPayloadByteUpto;
|
||||||
|
|
||||||
|
private int lastDocID;
|
||||||
|
private int lastPosition;
|
||||||
|
private int lastStartOffset;
|
||||||
|
private int docCount;
|
||||||
|
|
||||||
|
final byte[] encoded;
|
||||||
|
|
||||||
|
private final ForUtil forUtil;
|
||||||
|
private final Lucene41SkipWriter skipWriter;
|
||||||
|
|
||||||
|
/** Creates a postings writer with the specified PackedInts overhead ratio */
|
||||||
|
// TODO: does this ctor even make sense?
|
||||||
|
public TempPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
|
||||||
|
super();
|
||||||
|
|
||||||
|
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
IndexOutput posOut = null;
|
||||||
|
IndexOutput payOut = null;
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
|
||||||
|
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
|
||||||
|
if (state.fieldInfos.hasProx()) {
|
||||||
|
posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.POS_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasPayloads()) {
|
||||||
|
payloadBytes = new byte[128];
|
||||||
|
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
} else {
|
||||||
|
payloadBytes = null;
|
||||||
|
payloadLengthBuffer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasOffsets()) {
|
||||||
|
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
} else {
|
||||||
|
offsetStartDeltaBuffer = null;
|
||||||
|
offsetLengthBuffer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||||
|
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posDeltaBuffer = null;
|
||||||
|
payloadLengthBuffer = null;
|
||||||
|
offsetStartDeltaBuffer = null;
|
||||||
|
offsetLengthBuffer = null;
|
||||||
|
payloadBytes = null;
|
||||||
|
}
|
||||||
|
this.payOut = payOut;
|
||||||
|
this.posOut = posOut;
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
freqBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
|
||||||
|
// TODO: should we try skipping every 2/4 blocks...?
|
||||||
|
skipWriter = new Lucene41SkipWriter(maxSkipLevels,
|
||||||
|
BLOCK_SIZE,
|
||||||
|
state.segmentInfo.getDocCount(),
|
||||||
|
docOut,
|
||||||
|
posOut,
|
||||||
|
payOut);
|
||||||
|
|
||||||
|
encoded = new byte[MAX_ENCODED_SIZE];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates a postings writer with <code>PackedInts.COMPACT</code> */
|
||||||
|
public TempPostingsWriter(SegmentWriteState state) throws IOException {
|
||||||
|
this(state, PackedInts.COMPACT);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void start(IndexOutput termsOut) throws IOException {
|
||||||
|
this.termsOut = termsOut;
|
||||||
|
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
|
||||||
|
termsOut.writeVInt(BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setField(FieldInfo fieldInfo) {
|
||||||
|
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||||
|
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
|
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
fieldHasPayloads = fieldInfo.hasPayloads();
|
||||||
|
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startTerm() {
|
||||||
|
docTermStartFP = docOut.getFilePointer();
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
posTermStartFP = posOut.getFilePointer();
|
||||||
|
if (fieldHasPayloads || fieldHasOffsets) {
|
||||||
|
payTermStartFP = payOut.getFilePointer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastDocID = 0;
|
||||||
|
lastBlockDocID = -1;
|
||||||
|
// if (DEBUG) {
|
||||||
|
// System.out.println("FPW.startTerm startFP=" + docTermStartFP);
|
||||||
|
// }
|
||||||
|
skipWriter.resetSkip();
|
||||||
|
}

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    // if (DEBUG) {
    //   System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
    // }
    // We have collected a full block of docs and a new doc has arrived:
    // buffer skip data pointing at the completed block.
    if (lastBlockDocID != -1 && docBufferUpto == 0) {
      // if (DEBUG) {
      //   System.out.println("  bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
      // }
      skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockPayloadByteUpto);
    }

    final int docDelta = docID - lastDocID;

    if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + ") (docOut: " + docOut + ")");
    }

    docDeltaBuffer[docBufferUpto] = docDelta;
    // if (DEBUG) {
    //   System.out.println("  docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
    // }
    if (fieldHasFreqs) {
      freqBuffer[docBufferUpto] = termDocFreq;
    }
    docBufferUpto++;
    docCount++;

    if (docBufferUpto == BLOCK_SIZE) {
      // if (DEBUG) {
      //   System.out.println("  write docDelta block @ fp=" + docOut.getFilePointer());
      // }
      forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
      if (fieldHasFreqs) {
        // if (DEBUG) {
        //   System.out.println("  write freq block @ fp=" + docOut.getFilePointer());
        // }
        forUtil.writeBlock(freqBuffer, encoded, docOut);
      }
      // NOTE: don't set docBufferUpto back to 0 here;
      // finishDoc will do so (because it needs to see that
      // the block was filled so it can save skip data)
    }

    lastDocID = docID;
    lastPosition = 0;
    lastStartOffset = 0;
  }
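
  // Worked example (assuming the Lucene41-style block size of 128 that this
  // format mirrors): a term appearing in 300 docs yields two ForUtil-packed
  // blocks of 128 deltas each, while the remaining 44 deltas stay buffered
  // and become the vInt tail that finishTerm writes.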

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    // if (DEBUG) {
    //   System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
    // }
    posDeltaBuffer[posBufferUpto] = position - lastPosition;
    if (fieldHasPayloads) {
      if (payload == null || payload.length == 0) {
        // no payload
        payloadLengthBuffer[posBufferUpto] = 0;
      } else {
        payloadLengthBuffer[posBufferUpto] = payload.length;
        if (payloadByteUpto + payload.length > payloadBytes.length) {
          payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
        }
        System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
        payloadByteUpto += payload.length;
      }
    }

    if (fieldHasOffsets) {
      assert startOffset >= lastStartOffset;
      assert endOffset >= startOffset;
      offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
      offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
      lastStartOffset = startOffset;
    }

    posBufferUpto++;
    lastPosition = position;
    if (posBufferUpto == BLOCK_SIZE) {
      // if (DEBUG) {
      //   System.out.println("  write pos bulk block @ fp=" + posOut.getFilePointer());
      // }
      forUtil.writeBlock(posDeltaBuffer, encoded, posOut);

      if (fieldHasPayloads) {
        forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
        payOut.writeVInt(payloadByteUpto);
        payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
        payloadByteUpto = 0;
      }
      if (fieldHasOffsets) {
        forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
        forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
      }
      posBufferUpto = 0;
    }
  }
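
  // Layout of one full block on the .pay stream, as written above: packed
  // payload lengths, then a vInt byte count plus the raw payload bytes, then
  // packed offset start-deltas and offset lengths. Resetting payloadByteUpto
  // keeps the byte buffer bounded by one block's worth of payloads.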

  @Override
  public void finishDoc() throws IOException {
    // Since we don't know the df for the current term up front, we buffer
    // skip data for each completed block and write it to the skip file
    // when the next doc arrives (see startDoc).
    if (docBufferUpto == BLOCK_SIZE) {
      lastBlockDocID = lastDocID;
      if (posOut != null) {
        if (payOut != null) {
          lastBlockPayFP = payOut.getFilePointer();
        }
        lastBlockPosFP = posOut.getFilePointer();
        lastBlockPosBufferUpto = posBufferUpto;
        lastBlockPayloadByteUpto = payloadByteUpto;
      }
      // if (DEBUG) {
      //   System.out.println("  docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
      // }
      docBufferUpto = 0;
    }
  }
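
  // A block's skip entry is only buffered once the first doc of the *next*
  // block arrives (see startDoc): a term that ends exactly on a block
  // boundary never needs a skip point after its last block, and finishTerm
  // omits the skip data entirely when docCount never exceeds one block.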

  private static class PendingTerm {
    public final long docStartFP;
    public final long posStartFP;
    public final long payStartFP;
    public final long skipOffset;
    public final long lastPosBlockOffset;
    public final int singletonDocID;

    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
      this.docStartFP = docStartFP;
      this.posStartFP = posStartFP;
      this.payStartFP = payStartFP;
      this.skipOffset = skipOffset;
      this.lastPosBlockOffset = lastPosBlockOffset;
      this.singletonDocID = singletonDocID;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {
    assert stats.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;

    // if (DEBUG) {
    //   System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
    // }

    // if (DEBUG) {
    //   if (docBufferUpto > 0) {
    //     System.out.println("  write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
    //   }
    // }

    // When docFreq == 1, don't write the single docID/freq to a separate
    // file along with a pointer to it.
    final int singletonDocID;
    if (stats.docFreq == 1) {
      // pulse the singleton docID into the term dictionary; freq is implicitly totalTermFreq
      singletonDocID = docDeltaBuffer[0];
    } else {
      singletonDocID = -1;
      // vInt encode the remaining doc deltas and freqs:
      for(int i=0;i<docBufferUpto;i++) {
        final int docDelta = docDeltaBuffer[i];
        final int freq = freqBuffer[i];
        if (!fieldHasFreqs) {
          docOut.writeVInt(docDelta);
        } else if (freq == 1) {
          docOut.writeVInt((docDelta<<1)|1);
        } else {
          docOut.writeVInt(docDelta<<1);
          docOut.writeVInt(freq);
        }
      }
    }
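
    // Worked example of the packing above: docDelta=5 with freq=1 is the
    // single vInt (5<<1)|1 = 11, while docDelta=5 with freq=3 becomes the
    // two vInts 10 and 3. The low bit tells the reader whether an explicit
    // freq follows.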

    final long lastPosBlockOffset;

    if (fieldHasPositions) {
      // if (DEBUG) {
      //   if (posBufferUpto > 0) {
      //     System.out.println("  write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
      //   }
      // }

      // totalTermFreq is just the total number of positions (or payloads,
      // or offsets) associated with the current term.
      assert stats.totalTermFreq != -1;
      if (stats.totalTermFreq > BLOCK_SIZE) {
        // record file offset for last pos in last block
        lastPosBlockOffset = posOut.getFilePointer() - posTermStartFP;
      } else {
        lastPosBlockOffset = -1;
      }
      if (posBufferUpto > 0) {
        // TODO: should we send offsets/payloads to
        // .pay...?  seems wasteful (have to store extra
        // vLong for low (< BLOCK_SIZE) DF terms = vast
        // majority)

        // vInt encode the remaining positions/payloads/offsets:
        int lastPayloadLength = -1;  // force first payload length to be written
        int lastOffsetLength = -1;   // force first offset length to be written
        int payloadBytesReadUpto = 0;
        for(int i=0;i<posBufferUpto;i++) {
          final int posDelta = posDeltaBuffer[i];
          if (fieldHasPayloads) {
            final int payloadLength = payloadLengthBuffer[i];
            if (payloadLength != lastPayloadLength) {
              lastPayloadLength = payloadLength;
              posOut.writeVInt((posDelta<<1)|1);
              posOut.writeVInt(payloadLength);
            } else {
              posOut.writeVInt(posDelta<<1);
            }

            // if (DEBUG) {
            //   System.out.println("  i=" + i + " payloadLen=" + payloadLength);
            // }

            if (payloadLength != 0) {
              // if (DEBUG) {
              //   System.out.println("  write payload @ pos.fp=" + posOut.getFilePointer());
              // }
              posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
              payloadBytesReadUpto += payloadLength;
            }
          } else {
            posOut.writeVInt(posDelta);
          }

          if (fieldHasOffsets) {
            // if (DEBUG) {
            //   System.out.println("  write offset @ pos.fp=" + posOut.getFilePointer());
            // }
            int delta = offsetStartDeltaBuffer[i];
            int length = offsetLengthBuffer[i];
            if (length == lastOffsetLength) {
              posOut.writeVInt(delta << 1);
            } else {
              posOut.writeVInt(delta << 1 | 1);
              posOut.writeVInt(length);
              lastOffsetLength = length;
            }
          }
        }

        if (fieldHasPayloads) {
          assert payloadBytesReadUpto == payloadByteUpto;
          payloadByteUpto = 0;
        }
      }
      // if (DEBUG) {
      //   System.out.println("  totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
      // }
    } else {
      lastPosBlockOffset = -1;
    }

    long skipOffset;
    if (docCount > BLOCK_SIZE) {
      skipOffset = skipWriter.writeSkip(docOut) - docTermStartFP;

      // if (DEBUG) {
      //   System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
      // }
    } else {
      skipOffset = -1;
      // if (DEBUG) {
      //   System.out.println("  no skip: docCount=" + docCount);
      // }
    }

    long payStartFP;
    if (stats.totalTermFreq >= BLOCK_SIZE) {
      payStartFP = payTermStartFP;
    } else {
      payStartFP = -1;
    }

    // if (DEBUG) {
    //   System.out.println("  payStartFP=" + payStartFP);
    // }

    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
    docBufferUpto = 0;
    posBufferUpto = 0;
    lastDocID = 0;
    docCount = 0;
  }
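
  // Each finished term thus leaves behind one PendingTerm carrying its start
  // file pointers, with -1 as the sentinel for anything it lacks (no skip
  // data, no multi-block positions, no singleton doc); flushTermsBlock
  // serializes these into the term dictionary below.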

  private final RAMOutputStream bytesWriter = new RAMOutputStream();

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    assert start <= pendingTerms.size();
    assert count <= start;

    final int limit = pendingTerms.size() - start + count;

    long lastDocStartFP = 0;
    long lastPosStartFP = 0;
    long lastPayStartFP = 0;
    for(int idx=limit-count; idx<limit; idx++) {
      PendingTerm term = pendingTerms.get(idx);

      if (term.singletonDocID == -1) {
        bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
        lastDocStartFP = term.docStartFP;
      } else {
        bytesWriter.writeVInt(term.singletonDocID);
      }

      if (fieldHasPositions) {
        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
        lastPosStartFP = term.posStartFP;
        if (term.lastPosBlockOffset != -1) {
          bytesWriter.writeVLong(term.lastPosBlockOffset);
        }
        if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
          bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
          lastPayStartFP = term.payStartFP;
        }
      }

      if (term.skipOffset != -1) {
        bytesWriter.writeVLong(term.skipOffset);
      }
    }

    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();

    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }
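
  // The start file pointers are delta-coded across the terms of one block:
  // e.g. three terms whose docStartFP values are 100, 250 and 400 are written
  // as the vLongs 100, 150 and 150, which stays compact because terms flushed
  // together land near each other in the .doc file.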

  @Override
  public void close() throws IOException {
    IOUtils.close(docOut, posOut, payOut);
  }
}
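
For reference, a minimal self-contained sketch of the delta/freq packing that finishTerm's vInt tail performs. It assumes Lucene's standard 7-bits-per-byte vInt encoding; the class and method names here are illustrative only, not part of the patch:

import java.io.ByteArrayOutputStream;

public class VIntTailDemo {
  // Standard varint: low 7 bits per byte, high bit set while more bits remain.
  static void writeVInt(ByteArrayOutputStream out, int v) {
    while ((v & ~0x7F) != 0) {
      out.write((v & 0x7F) | 0x80);
      v >>>= 7;
    }
    out.write(v);
  }

  // Encode one (docDelta, freq) pair the way the writer's tail loop does.
  static void encode(ByteArrayOutputStream out, int docDelta, int freq) {
    if (freq == 1) {
      writeVInt(out, (docDelta << 1) | 1);  // low bit set: freq is implicitly 1
    } else {
      writeVInt(out, docDelta << 1);        // low bit clear: explicit freq follows
      writeVInt(out, freq);
    }
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    encode(out, 5, 1);  // one byte: (5<<1)|1 = 11
    encode(out, 5, 3);  // two bytes: 10, then 3
    System.out.println(out.size());  // prints 3
  }
}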

@ -0,0 +1,54 @@
package org.apache.lucene.codecs.temp;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.TermState;

/**
 * Holds all state required for {@link PostingsReaderBase}
 * to produce a {@link DocsEnum} without re-seeking the
 * terms dict.
 */
public class TempTermState extends TermState {
  /** how many docs have this term */
  public int docFreq;
  /** total number of occurrences of this term */
  public long totalTermFreq;

  /** the term's ord in the current block */
  public int termBlockOrd;

  /** Sole constructor. (For invocation by subclass
   *  constructors, typically implicit.) */
  protected TempTermState() {
  }

  @Override
  public void copyFrom(TermState _other) {
    assert _other instanceof TempTermState : "can not copy from " + _other.getClass().getName();
    TempTermState other = (TempTermState) _other;
    docFreq = other.docFreq;
    totalTermFreq = other.totalTermFreq;
    termBlockOrd = other.termBlockOrd;
  }
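
  // copyFrom must transfer every field: term dictionaries clone and recycle
  // TermState instances when caching and restoring seek state, so a field
  // left uncopied would leak values from a previously visited term.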

  @Override
  public String toString() {
    return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd;
  }
}

@ -15,3 +15,4 @@
org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat
org.apache.lucene.codecs.temp.TempPostingsFormat