From 34276f9e80a04e274ace0485640fed87e8ed4ab3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Wed, 25 Apr 2012 22:44:05 +0000
Subject: [PATCH] LUCENE-2946: doc 4.0 term vectors format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1330591 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene40/Lucene40SegmentInfosFormat.java  | 24 +++----
 .../lucene40/Lucene40TermVectorsFormat.java   | 72 +++++++++++++++++++
 2 files changed, 84 insertions(+), 12 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
index b4783b3d214..4852bbbc73d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
@@ -1,17 +1,5 @@
 package org.apache.lucene.codecs.lucene40;
 
-import org.apache.lucene.codecs.Codec; // javadocs
-import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
-import org.apache.lucene.codecs.SegmentInfosFormat;
-import org.apache.lucene.codecs.SegmentInfosReader;
-import org.apache.lucene.codecs.SegmentInfosWriter;
-import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
-import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
-import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
-import org.apache.lucene.index.IndexWriter; // javadocs
-import org.apache.lucene.index.SegmentInfos; // javadocs
-import org.apache.lucene.store.DataOutput; // javadocs
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,6 +17,18 @@ import org.apache.lucene.store.DataOutput; // javadocs
  * limitations under the License.
  */
 
+import org.apache.lucene.codecs.Codec; // javadocs
+import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
+import org.apache.lucene.codecs.SegmentInfosFormat;
+import org.apache.lucene.codecs.SegmentInfosReader;
+import org.apache.lucene.codecs.SegmentInfosWriter;
+import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
+import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
+import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.index.SegmentInfos; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
+
 /**
  * Lucene 4.0 Segments format.
  * <p>
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java
index 8e3f68177e3..b7fc81266e3 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java
@@ -25,9 +25,81 @@ import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.DataOutput; // javadocs
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 
+/**
+ * Lucene 4.0 Term Vectors format.
+ * <p>Term Vector support is an optional on a field by field basis. It consists of
+ * 3 files.</p>
+ * <ol>
+ * <li><a name="tvx" id="tvx"></a>
+ * <p>The Document Index or .tvx file.</p>
+ * <p>For each document, this stores the offset into the document data (.tvd) and
+ * field data (.tvf) files.</p>
+ * <p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
+ * <sup>NumDocs</sup></p>
+ * <ul>
+ *   <li>TVXVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>DocumentPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
+ *   <li>FieldPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
+ * </ul>
+ * </li>
+ * <li><a name="tvd" id="tvd"></a>
+ * <p>The Document or .tvd file.</p>
+ * <p>This contains, for each document, the number of fields, a list of the fields
+ * with term vector info and finally a list of pointers to the field information
+ * in the .tvf (Term Vector Fields) file.</p>
+ * <p>The .tvd file is used to map out the fields that have term vectors stored
+ * and where the field information is in the .tvf file.</p>
+ * <p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
+ * FieldPositions&gt; <sup>NumDocs</sup></p>
+ * <ul>
+ *   <li>TVDVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>NumFields --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
+ *   <li>FieldNumDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></li>
+ *   <li>FieldPositionDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * </li>
+ * <li><a name="tvf" id="tvf"></a>
+ * <p>The Field or .tvf file.</p>
+ * <p>This file contains, for each field that has a term vector stored, a list of
+ * the terms, their frequencies and, optionally, position and offset
+ * information.</p>
+ * <p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
+ * <sup>NumFields</sup></p>
+ * <ul>
+ *   <li>TVFVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Position/Offset --&gt; {@link DataOutput#writeByte Byte}</li>
+ *   <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
+ *       <sup>NumTerms</sup></li>
+ *   <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
+ *   <li>PrefixLength --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Suffix --&gt; {@link DataOutput#writeString String}</li>
+ *   <li>TermFreq --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Positions --&gt; &lt;{@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
+ *   <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Position/Offset byte stores whether this term vector has position or offset
+ * information stored.</li>
+ * <li>Term byte prefixes are shared. The PrefixLength is the number of initial
+ * bytes from the previous term which must be pre-pended to a term's suffix
+ * in order to form the term's bytes. Thus, if the previous term's text was "bone"
+ * and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
+ * <li>Positions are stored as delta encoded VInts. This means we only store the
+ * difference of the current position from the last position</li>
+ * <li>Offsets are stored as delta encoded VInts. The first VInt is the
+ * startOffset, the second is the endOffset.</li>
+ * </ul>
+ * </li>
+ * </ol>
+ */
 public class Lucene40TermVectorsFormat extends TermVectorsFormat {
 
   @Override