mirror of https://github.com/apache/lucene.git
LUCENE-4733: Make CompressingTermVectorsFormat the new default term vectors format.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441732 13f79535-47bb-0310-9956-ffa450edef68
parent 369f2379c2
commit 94b17a28de

CHANGES.txt
@@ -69,6 +69,10 @@ Optimizations
 * LUCENE-4740: Don't track clones of MMapIndexInput if unmapping
   is disabled. This reduces GC overhead. (Kristofer Karlsson, Uwe Schindler)
 
+* LUCENE-4733: The default Lucene 4.2 codec now uses a more compact
+  TermVectorsFormat (Lucene42TermVectorsFormat) based on
+  CompressingTermVectorsFormat. (Adrien Grand)
+
 New Features
 
 * LUCENE-4686: New specialized DGapVInt8IntEncoder for facets (now the
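A quick way to observe the effect of this entry is to ask the default codec for its term vectors format. This is a minimal sketch, not part of the commit; it only relies on Codec.getDefault() and Codec#termVectorsFormat().

// Minimal sketch (not part of the commit): after this change the default
// codec reports the Lucene 4.2 term vectors format.
import org.apache.lucene.codecs.Codec;

public class DefaultTermVectorsFormatCheck {
  public static void main(String[] args) {
    Codec codec = Codec.getDefault();
    // Expected to print the name of the Lucene42 term vectors format.
    System.out.println(codec.termVectorsFormat().getClass().getSimpleName());
  }
}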

CompressingStoredFieldsIndexReader.java
@@ -28,7 +28,11 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.packed.PackedInts;
 
-class CompressingStoredFieldsIndexReader implements Closeable, Cloneable {
+/**
+ * Random-access reader for {@link CompressingStoredFieldsIndexWriter}.
+ * @lucene.internal
+ */
+public final class CompressingStoredFieldsIndexReader implements Closeable, Cloneable {
 
   final IndexInput fieldsIndexIn;
 

CompressingStoredFieldsIndexWriter.java
@@ -20,10 +20,54 @@ package org.apache.lucene.codecs.compressing;
 import java.io.Closeable;
 import java.io.IOException;
 
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.packed.PackedInts;
 
-class CompressingStoredFieldsIndexWriter implements Closeable {
+/**
+ * Efficient index format for block-based {@link Codec}s.
+ * <p> This writer generates a file which can be loaded into memory using
+ * memory-efficient data structures to quickly locate the block that contains
+ * any document.
+ * <p>In order to have a compact in-memory representation, for every block of
+ * 1024 chunks, this index computes the average number of bytes per
+ * chunk and for every chunk, only stores the difference between<ul>
+ * <li>${chunk number} * ${average length of a chunk}</li>
+ * <li>and the actual start offset of the chunk</li></ul></p>
+ * <p>Data is written as follows:</p>
+ * <ul>
+ * <li>PackedIntsVersion, <Block><sup>BlockCount</sup>, BlocksEndMarker</li>
+ * <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>BlocksEndMarker --> <tt>0</tt> as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with <tt>0</tt></li>
+ * <li>Block --> BlockChunks, <DocBases>, <StartPointers></li>
+ * <li>BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block</li>
+ * <li>DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas</li>
+ * <li>DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * <li>DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a>.</li>
+ * <li>StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas</li>
+ * <li>StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}</li>
+ * <li>AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}</li>
+ * <li>BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * <li>StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * </ul>
+ * <p>Notes</p>
+ * <ul>
+ * <li>For any block, the doc base of the n-th chunk can be restored with
+ * <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li>
+ * <li>For any block, the start pointer of the n-th chunk can be restored with
+ * <code>StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]</code>.</li>
+ * <li>Once data is loaded into memory, you can lookup the start pointer of any
+ * document by performing two binary searches: a first one based on the values
+ * of DocBase in order to find the right block, and then inside the block based
+ * on DocBaseDeltas (by reconstructing the doc bases for every chunk).</li>
+ * </ul>
+ * @lucene.internal
+ */
+public final class CompressingStoredFieldsIndexWriter implements Closeable {
 
   static final int BLOCK_SIZE = 1024; // number of chunks to serialize at once
 
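The notes in the javadoc above can be illustrated with a small sketch. It is not part of the commit: the class and field names below are hypothetical, and it assumes the per-block packed arrays have already been read and ZigZag-decoded into plain Java arrays.

// Illustrative sketch only (not from the commit): restore the doc base and
// start pointer of the n-th chunk of one block, from the stored averages and
// the decoded per-chunk deltas.
final class BlockSketch {
  int docBase;               // first document ID of the block
  int avgChunkDocs;          // average number of documents per chunk
  long startPointerBase;     // first start pointer of the block
  long avgChunkSize;         // average byte size of a chunk
  int[] docBaseDeltas;       // decoded deltas from the average doc base
  long[] startPointerDeltas; // decoded deltas from the average start pointer

  int chunkDocBase(int n) {
    // DocBase + AvgChunkDocs * n + DocBaseDeltas[n]
    return docBase + avgChunkDocs * n + docBaseDeltas[n];
  }

  long chunkStartPointer(int n) {
    // StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]
    return startPointerBase + avgChunkSize * n + startPointerDeltas[n];
  }
}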

CompressingTermVectorsFormat.java
@@ -34,7 +34,7 @@ import org.apache.lucene.store.IOContext;
  * order to improve the compression ratio.
  * @lucene.experimental
  */
-public final class CompressingTermVectorsFormat extends TermVectorsFormat {
+public class CompressingTermVectorsFormat extends TermVectorsFormat {
 
   private final String formatName;
   private final String segmentSuffix;
@@ -79,7 +79,7 @@ public final class CompressingTermVectorsFormat extends TermVectorsFormat {
   }
 
   @Override
-  public TermVectorsReader vectorsReader(Directory directory,
+  public final TermVectorsReader vectorsReader(Directory directory,
       SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context)
       throws IOException {
     return new CompressingTermVectorsReader(directory, segmentInfo, segmentSuffix,
@@ -87,7 +87,7 @@ public final class CompressingTermVectorsFormat extends TermVectorsFormat {
   }
 
   @Override
-  public TermVectorsWriter vectorsWriter(Directory directory,
+  public final TermVectorsWriter vectorsWriter(Directory directory,
       SegmentInfo segmentInfo, IOContext context) throws IOException {
     return new CompressingTermVectorsWriter(directory, segmentInfo, segmentSuffix,
         context, formatName, compressionMode, chunkSize);
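With the class no longer final and the reader/writer factory methods locked down, concrete formats are expected to pick a format name, compression mode and chunk size through the constructor, as Lucene42TermVectorsFormat does later in this commit. A hedged sketch of such a subclass follows; the class name, format name and chunk size are hypothetical.

// Illustrative sketch only (not from the commit): a custom subclass in the
// style enabled by this change. The super(...) arguments mirror the call made
// by Lucene42TermVectorsFormat below; the chunk size here is arbitrary.
import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;

public final class BigChunkTermVectorsFormat extends CompressingTermVectorsFormat {
  public BigChunkTermVectorsFormat() {
    // format name, segment suffix, compression mode, chunk size in bytes
    super("BigChunkTermVectors", "", CompressionMode.HIGH_COMPRESSION, 1 << 16);
  }
}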

Lucene41StoredFieldsFormat.java
@@ -20,6 +20,7 @@ package org.apache.lucene.codecs.lucene41;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
+import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
 import org.apache.lucene.codecs.compressing.CompressionMode;
 import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
 import org.apache.lucene.store.DataOutput;
@@ -50,7 +51,7 @@ import org.apache.lucene.util.packed.PackedInts;
  * <a href="http://fastcompression.blogspot.fr/2011/05/lz4-explained.html">compression format</a>.</p>
  * <p>Here is a more detailed description of the field data file format:</p>
  * <ul>
- * <li>FieldData (.fdt) --> <Header>, PackedIntsVersion, CompressionFormat, <Chunk><sup>ChunkCount</sup></li>
+ * <li>FieldData (.fdt) --> <Header>, PackedIntsVersion, <Chunk><sup>ChunkCount</sup></li>
  * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
  * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all document of the segment</li>
@@ -95,43 +96,11 @@ import org.apache.lucene.util.packed.PackedInts;
  * </ul>
  * </li>
  * <li><a name="field_index" id="field_index"></a>
- * <p>A fields index file (extension <tt>.fdx</tt>). The data stored in this
- * file is read to load an in-memory data-structure that can be used to locate
- * the start offset of a block containing any document in the fields data file.</p>
- * <p>In order to have a compact in-memory representation, for every block of
- * 1024 chunks, this stored fields index computes the average number of bytes per
- * chunk and for every chunk, only stores the difference between<ul>
- * <li>${chunk number} * ${average length of a chunk}</li>
- * <li>and the actual start offset of the chunk</li></ul></p>
- * <p>Data is written as follows:</p>
+ * <p>A fields index file (extension <tt>.fdx</tt>).</p>
  * <ul>
- * <li>FieldsIndex (.fdx) --> <Header>, FieldsIndex, PackedIntsVersion, <Block><sup>BlockCount</sup>, BlocksEndMarker</li>
+ * <li>FieldsIndex (.fdx) --> <Header>, <ChunkIndex></li>
  * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
- * <li>BlocksEndMarker --> <tt>0</tt> as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with <tt>0</tt></li>
- * <li>Block --> BlockChunks, <DocBases>, <StartPointers></li>
- * <li>BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block</li>
- * <li>DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas</li>
- * <li>DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}</li>
- * <li>AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}</li>
- * <li>BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * <li>DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a>.</li>
- * <li>StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas</li>
- * <li>StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}</li>
- * <li>AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}</li>
- * <li>BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * <li>StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * </ul>
- * <p>Notes</p>
- * <ul>
- * <li>For any block, the doc base of the n-th chunk can be restored with
- * <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li>
- * <li>For any block, the start pointer of the n-th chunk can be restored with
- * <code>StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]</code>.</li>
- * <li>Once data is loaded into memory, you can lookup the start pointer of any
- * document by performing two binary searches: a first one based on the values
- * of DocBase in order to find the right block, and then inside the block based
- * on DocBaseDeltas (by reconstructing the doc bases for every chunk).</li>
+ * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
  * </ul>
  * </li>
  * </ol>
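The removed notes describe how a reader uses the loaded index: one binary search over the DocBase of each block, then a second one inside the block over reconstructed chunk doc bases. A sketch of that lookup follows; it is not part of the commit, and the class, method and array names are hypothetical (the arrays are assumed to be already loaded and decoded).

// Illustrative sketch only (not from the commit): locate the chunk that
// contains docID using the two binary searches described above.
final class StoredFieldsIndexLookupSketch {

  /** First search: index of the last block whose DocBase is <= docID. */
  static int findBlock(int[] blockDocBases, int docID) {
    int lo = 0, hi = blockDocBases.length - 1;
    while (lo <= hi) {
      int mid = (lo + hi) >>> 1;
      if (blockDocBases[mid] <= docID) {
        lo = mid + 1;
      } else {
        hi = mid - 1;
      }
    }
    return hi;
  }

  /** Second search: inside one block, index of the last chunk whose
   *  reconstructed doc base (DocBase + AvgChunkDocs * n + DocBaseDeltas[n])
   *  is <= docID. */
  static int findChunk(int docBase, int avgChunkDocs, int[] docBaseDeltas, int docID) {
    int lo = 0, hi = docBaseDeltas.length - 1;
    while (lo <= hi) {
      int mid = (lo + hi) >>> 1;
      int chunkDocBase = docBase + avgChunkDocs * mid + docBaseDeltas[mid];
      if (chunkDocBase <= docID) {
        lo = mid + 1;
      } else {
        hi = mid - 1;
      }
    }
    return hi;
  }
}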

Lucene42Codec.java
@@ -18,13 +18,13 @@ package org.apache.lucene.codecs.lucene42;
  */
 
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.SegmentInfoFormat;
-import org.apache.lucene.codecs.DocValuesFormat;
-import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40DocValuesFormat;
@@ -32,7 +32,6 @@ import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40NormsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat;
-import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
@@ -50,7 +49,7 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 // (it writes a minor version, etc).
 public class Lucene42Codec extends Codec {
   private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
-  private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
+  private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
   private final FieldInfosFormat fieldInfosFormat = new Lucene40FieldInfosFormat();
   private final SegmentInfoFormat infosFormat = new Lucene40SegmentInfoFormat();
   private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();
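Since Lucene42Codec is the default codec, any field indexed with term vectors enabled now goes through the new compressing format. A minimal indexing sketch using the 4.x-era API follows; it is not part of the commit, and Version.LUCENE_42, the analyzer choice and the field name are assumptions made for illustration.

// Illustrative sketch only (not from the commit): index one document with
// term vectors, which the default Lucene42Codec stores with
// Lucene42TermVectorsFormat.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class TermVectorsIndexingSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig config =
        new IndexWriterConfig(Version.LUCENE_42, new StandardAnalyzer(Version.LUCENE_42));
    IndexWriter writer = new IndexWriter(dir, config);

    // Enable term vectors (with positions and offsets) on an indexed field.
    FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
    type.setStoreTermVectors(true);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectorOffsets(true);

    Document doc = new Document();
    doc.add(new Field("body", "compressed term vectors example", type));
    writer.addDocument(doc);
    writer.close();
  }
}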

Lucene42TermVectorsFormat.java (new file)
@@ -0,0 +1,130 @@
+package org.apache.lucene.codecs.lucene42;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
+import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * Lucene 4.2 {@link TermVectorsFormat term vectors format}.
+ * <p>
+ * Very similarly to {@link Lucene41StoredFieldsFormat}, this format is based
+ * on compressed chunks of data, with document-level granularity so that a
+ * document can never span across distinct chunks. Moreover, data is made as
+ * compact as possible:<ul>
+ * <li>textual data is compressed using the very light,
+ * <a href="http://code.google.com/p/lz4/">LZ4</a> compression algorithm,
+ * <li>binary data is written using fixed-size blocks of
+ * {@link PackedInts packed ints}.
+ * </ul>
+ * <p>
+ * Term vectors are stored using two files<ul>
+ * <li>a data file where terms, frequencies, positions, offsets and payloads
+ * are stored,
+ * <li>an index file, loaded into memory, used to locate specific documents in
+ * the data file.
+ * </ul>
+ * Looking up term vectors for any document requires at most 1 disk seek.
+ * <p><b>File formats</b>
+ * <ol>
+ * <li><a name="vector_data" id="vector_data"></a>
+ * <p>A vector data file (extension <tt>.tvd</tt>). This file stores terms,
+ * frequencies, positions, offsets and payloads for every document. Upon writing
+ * a new segment, it accumulates data into memory until the buffer used to store
+ * terms and payloads grows beyond 4KB. Then it flushes all metadata, terms
+ * and positions to disk using <a href="http://code.google.com/p/lz4/">LZ4</a>
+ * compression for terms and payloads and
+ * {@link BlockPackedWriter blocks of packed ints} for positions.</p>
+ * <p>Here is a more detailed description of the field data file format:</p>
+ * <ul>
+ * <li>VectorData (.tvd) --> <Header>, PackedIntsVersion, ChunkSize, <Chunk><sup>ChunkCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>ChunkSize is the number of bytes of terms to accumulate before flushing, as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all document of the segment</li>
+ * <li>Chunk --> DocBase, ChunkDocs, < NumFields >, < FieldNums >, < FieldNumOffs >, < Flags >,
+ * < NumTerms >, < TermLengths >, < TermFreqs >, < Positions >, < StartOffsets >, < Lengths >,
+ * < PayloadLengths >, < TermAndPayloads ></li>
+ * <li>DocBase is the ID of the first doc of the chunk as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>ChunkDocs is the number of documents in the chunk</li>
+ * <li>NumFields --> DocNumFields<sup>ChunkDocs</sup></li>
+ * <li>DocNumFields is the number of fields for each doc, written as a {@link DataOutput#writeVInt VInt} if ChunkDocs==1 and as a {@link PackedInts} array otherwise</li>
+ * <li>FieldNums --> FieldNumDelta<sup>TotalDistincFields</sup>, a delta-encoded list of the sorted unique field numbers present in the chunk</li>
+ * <li>FieldNumOffs --> FieldNumOff<sup>TotalFields</sup>, as a {@link PackedInts} array</li>
+ * <li>FieldNumOff is the offset of the field number in FieldNums</li>
+ * <li>TotalFields is the total number of fields (sum of the values of NumFields)</li>
+ * <li>Flags --> Bit < FieldFlags ></li>
+ * <li>Bit is a single bit which when true means that fields have the same options for every document in the chunk</li>
+ * <li>FieldFlags --> if Bit==1: Flag<sup>TotalDistinctFields</sup> else Flag<sup>TotalFields</sup></li>
+ * <li>Flag: a 3-bits int where:<ul>
+ * <li>the first bit means that the field has positions</li>
+ * <li>the second bit means that the field has offsets</li>
+ * <li>the third bit means that the field has payloads</li>
+ * </ul></li>
+ * <li>NumTerms --> FieldNumTerms<sup>TotalFields</sup></li>
+ * <li>FieldNumTerms: the number of terms for each field, using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>TermLengths --> PrefixLength<sup>TotalTerms</sup> SuffixLength<sup>TotalTerms</sup></li>
+ * <li>TotalTerms: total number of terms (sum of NumTerms)</li>
+ * <li>PrefixLength: 0 for the first term of a field, the common prefix with the previous term otherwise using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>SuffixLength: length of the term minus PrefixLength for every term using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>TermFreqs --> TermFreqMinus1<sup>TotalTerms</sup></li>
+ * <li>TermFreqMinus1: (frequency - 1) for each term using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>Positions --> PositionDelta<sup>TotalPositions</sup></li>
+ * <li>TotalPositions is the sum of frequencies of terms of all fields that have positions</li>
+ * <li>PositionDelta: the absolute position for the first position of a term, and the difference with the previous positions for following positions using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>StartOffsets --> (AvgCharsPerTerm<sup>TotalDistinctFields</sup>) StartOffsetDelta<sup>TotalOffsets</sup></li>
+ * <li>TotalOffsets is the sum of frequencies of terms of all fields that have offsets</li>
+ * <li>AvgCharsPerTerm: average number of chars per term, encoded as a float on 4 bytes. They are not present if no field has both positions and offsets enabled.</li>
+ * <li>StartOffsetDelta: (startOffset - previousStartOffset - AvgCharsPerTerm * PositionDelta). previousStartOffset is 0 for the first offset and AvgCharsPerTerm is 0 if the field has no positions using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>Lengths --> LengthMinusTermLength<sup>TotalOffsets</sup></li>
+ * <li>LengthMinusTermLength: (endOffset - startOffset - termLength) using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>PayloadLengths --> PayloadLength<sup>TotalPayloads</sup></li>
+ * <li>TotalPayloads is the sum of frequencies of terms of all fields that have payloads</li>
+ * <li>PayloadLength is the payload length encoded using {@link BlockPackedWriter blocks of 64 packed ints}</li>
+ * <li>TermAndPayloads --> LZ4-compressed representation of < FieldTermsAndPayLoads ><sup>TotalFields</sup></li>
+ * <li>FieldTermsAndPayLoads --> Terms (Payloads)</li>
+ * <li>Terms: term bytes</li>
+ * <li>Payloads: payload bytes (if the field has payloads)</li>
+ * </ul>
+ * </li>
+ * <li><a name="vector_index" id="vector_index"></a>
+ * <p>An index file (extension <tt>.tvx</tt>).</p>
+ * <ul>
+ * <li>VectorIndex (.tvx) --> <Header>, <ChunkIndex></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
+ * </ul>
+ * </li>
+ * </ol>
+ * @lucene.experimental
+ */
+public final class Lucene42TermVectorsFormat extends CompressingTermVectorsFormat {
+
+  /** Sole constructor. */
+  public Lucene42TermVectorsFormat() {
+    super("Lucene41StoredFields", "", CompressionMode.FAST, 1 << 12);
+  }
+
+}
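The per-field Flag described in the javadoc above packs the three per-field options into three bits. A small sketch of how such a flag could be interpreted follows; it is not from the commit, and the exact bit order (positions in the lowest bit) is an assumption based on the "first/second/third bit" wording.

// Illustrative sketch only (not from the commit): interpret the 3-bit
// per-field Flag. Assumes bit 0 = positions, bit 1 = offsets, bit 2 = payloads.
final class FieldFlagSketch {
  static final int POSITIONS = 1 << 0;
  static final int OFFSETS   = 1 << 1;
  static final int PAYLOADS  = 1 << 2;

  static boolean hasPositions(int flag) { return (flag & POSITIONS) != 0; }
  static boolean hasOffsets(int flag)   { return (flag & OFFSETS) != 0; }
  static boolean hasPayloads(int flag)  { return (flag & PAYLOADS) != 0; }
}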