LUCENE-2946: doc 4.0 term vectors format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1330591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-04-25 22:44:05 +00:00
parent 9da30ebc82
commit 34276f9e80
2 changed files with 84 additions and 12 deletions

View File

@ -1,17 +1,5 @@
package org.apache.lucene.codecs.lucene40;
import org.apache.lucene.codecs.Codec; // javadocs
import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
import org.apache.lucene.codecs.SegmentInfosFormat;
import org.apache.lucene.codecs.SegmentInfosReader;
import org.apache.lucene.codecs.SegmentInfosWriter;
import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.SegmentInfos; // javadocs
import org.apache.lucene.store.DataOutput; // javadocs
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -29,6 +17,18 @@ import org.apache.lucene.store.DataOutput; // javadocs
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec; // javadocs
import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
import org.apache.lucene.codecs.SegmentInfosFormat;
import org.apache.lucene.codecs.SegmentInfosReader;
import org.apache.lucene.codecs.SegmentInfosWriter;
import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.SegmentInfos; // javadocs
import org.apache.lucene.store.DataOutput; // javadocs
/**
* Lucene 4.0 Segments format.
* <p>

View File

@ -25,9 +25,81 @@ import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
/**
* Lucene 4.0 Term Vectors format.
* <p>Term Vector support is an optional on a field by field basis. It consists of
* 3 files.</p>
* <ol>
* <li><a name="tvx" id="tvx"></a>
* <p>The Document Index or .tvx file.</p>
* <p>For each document, this stores the offset into the document data (.tvd) and
* field data (.tvf) files.</p>
* <p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
* <sup>NumDocs</sup></p>
* <ul>
* <li>TVXVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
* <li>DocumentPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
* <li>FieldPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
* </ul>
* </li>
* <li><a name="tvd" id="tvd"></a>
* <p>The Document or .tvd file.</p>
* <p>This contains, for each document, the number of fields, a list of the fields
* with term vector info and finally a list of pointers to the field information
* in the .tvf (Term Vector Fields) file.</p>
* <p>The .tvd file is used to map out the fields that have term vectors stored
* and where the field information is in the .tvf file.</p>
* <p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
* FieldPositions&gt; <sup>NumDocs</sup></p>
* <ul>
* <li>TVDVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
* <li>NumFields --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
* <li>FieldNumDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></li>
* <li>FieldPositionDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
* </ul>
* </li>
* <li><a name="tvf" id="tvf"></a>
* <p>The Field or .tvf file.</p>
* <p>This file contains, for each field that has a term vector stored, a list of
* the terms, their frequencies and, optionally, position and offset
* information.</p>
* <p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
* <sup>NumFields</sup></p>
* <ul>
* <li>TVFVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
* <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Position/Offset --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
* <sup>NumTerms</sup></li>
* <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
* <li>PrefixLength --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Suffix --&gt; {@link DataOutput#writeString String}</li>
* <li>TermFreq --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Positions --&gt; &lt;{@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
* <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Position/Offset byte stores whether this term vector has position or offset
* information stored.</li>
* <li>Term byte prefixes are shared. The PrefixLength is the number of initial
* bytes from the previous term which must be pre-pended to a term's suffix
* in order to form the term's bytes. Thus, if the previous term's text was "bone"
* and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
* <li>Positions are stored as delta encoded VInts. This means we only store the
* difference of the current position from the last position</li>
* <li>Offsets are stored as delta encoded VInts. The first VInt is the
* startOffset, the second is the endOffset.</li>
* </ul>
* </li>
* </ol>
*/
public class Lucene40TermVectorsFormat extends TermVectorsFormat {
@Override