mirror of https://github.com/apache/lucene.git
LUCENE-2946: doc 4.0 term vectors format
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1330591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9da30ebc82
commit
34276f9e80
|
@ -1,17 +1,5 @@
|
|||
package org.apache.lucene.codecs.lucene40;
|
||||
|
||||
import org.apache.lucene.codecs.Codec; // javadocs
|
||||
import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
|
||||
import org.apache.lucene.codecs.SegmentInfosFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfosReader;
|
||||
import org.apache.lucene.codecs.SegmentInfosWriter;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
|
||||
import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
|
||||
import org.apache.lucene.index.IndexWriter; // javadocs
|
||||
import org.apache.lucene.index.SegmentInfos; // javadocs
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -29,6 +17,18 @@ import org.apache.lucene.store.DataOutput; // javadocs
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.Codec; // javadocs
|
||||
import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
|
||||
import org.apache.lucene.codecs.SegmentInfosFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfosReader;
|
||||
import org.apache.lucene.codecs.SegmentInfosWriter;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
|
||||
import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
|
||||
import org.apache.lucene.index.IndexWriter; // javadocs
|
||||
import org.apache.lucene.index.SegmentInfos; // javadocs
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
|
||||
/**
|
||||
* Lucene 4.0 Segments format.
|
||||
* <p>
|
||||
|
|
|
@ -25,9 +25,81 @@ import org.apache.lucene.codecs.TermVectorsReader;
|
|||
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
||||
/**
|
||||
* Lucene 4.0 Term Vectors format.
|
||||
* <p>Term Vector support is optional on a field-by-field basis. It consists of
|
||||
* 3 files.</p>
|
||||
* <ol>
|
||||
* <li><a name="tvx" id="tvx"></a>
|
||||
* <p>The Document Index or .tvx file.</p>
|
||||
* <p>For each document, this stores the offset into the document data (.tvd) and
|
||||
* field data (.tvf) files.</p>
|
||||
* <p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
|
||||
* <sup>NumDocs</sup></p>
|
||||
* <ul>
|
||||
* <li>TVXVersion --> {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
|
||||
* <li>DocumentPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
|
||||
* <li>FieldPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="tvd" id="tvd"></a>
|
||||
* <p>The Document or .tvd file.</p>
|
||||
* <p>This contains, for each document, the number of fields, a list of the fields
|
||||
* with term vector info and finally a list of pointers to the field information
|
||||
* in the .tvf (Term Vector Fields) file.</p>
|
||||
* <p>The .tvd file is used to map out the fields that have term vectors stored
|
||||
* and where the field information is in the .tvf file.</p>
|
||||
* <p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
|
||||
* FieldPositions&gt; <sup>NumDocs</sup></p>
|
||||
* <ul>
|
||||
* <li>TVDVersion --> {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
|
||||
* <li>NumFields --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
|
||||
* <li>FieldNumDelta --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></li>
|
||||
* <li>FieldPositionDelta --> {@link DataOutput#writeVLong VLong}</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="tvf" id="tvf"></a>
|
||||
* <p>The Field or .tvf file.</p>
|
||||
* <p>This file contains, for each field that has a term vector stored, a list of
|
||||
* the terms, their frequencies and, optionally, position and offset
|
||||
* information.</p>
|
||||
* <p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
|
||||
* <sup>NumFields</sup></p>
|
||||
* <ul>
|
||||
* <li>TVFVersion --> {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
|
||||
* <li>NumTerms --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Position/Offset --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
|
||||
* <sup>NumTerms</sup></li>
|
||||
* <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
|
||||
* <li>PrefixLength --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Suffix --> {@link DataOutput#writeString String}</li>
|
||||
* <li>TermFreq --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Positions --&gt; &lt;{@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
|
||||
* <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
* <li>Position/Offset byte stores whether this term vector has position or offset
|
||||
* information stored.</li>
|
||||
* <li>Term byte prefixes are shared. The PrefixLength is the number of initial
|
||||
* bytes from the previous term which must be prepended to a term's suffix
|
||||
* in order to form the term's bytes. Thus, if the previous term's text was "bone"
|
||||
* and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
|
||||
* <li>Positions are stored as delta encoded VInts. This means we only store the
|
||||
* difference of the current position from the last position.</li>
|
||||
* <li>Offsets are stored as delta encoded VInts. The first VInt is the
|
||||
* startOffset, the second is the endOffset.</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* </ol>
|
||||
*/
|
||||
public class Lucene40TermVectorsFormat extends TermVectorsFormat {
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue