From 34276f9e80a04e274ace0485640fed87e8ed4ab3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 25 Apr 2012 22:44:05 +0000 Subject: [PATCH] LUCENE-2946: doc 4.0 term vectors format git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1330591 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene40/Lucene40SegmentInfosFormat.java | 24 +++---- .../lucene40/Lucene40TermVectorsFormat.java | 72 +++++++++++++++++++ 2 files changed, 84 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java index b4783b3d214..4852bbbc73d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java @@ -1,17 +1,5 @@ package org.apache.lucene.codecs.lucene40; -import org.apache.lucene.codecs.Codec; // javadocs -import org.apache.lucene.codecs.LiveDocsFormat; // javadocs -import org.apache.lucene.codecs.SegmentInfosFormat; -import org.apache.lucene.codecs.SegmentInfosReader; -import org.apache.lucene.codecs.SegmentInfosWriter; -import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs -import org.apache.lucene.codecs.TermVectorsFormat; // javadocs -import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs -import org.apache.lucene.index.IndexWriter; // javadocs -import org.apache.lucene.index.SegmentInfos; // javadocs -import org.apache.lucene.store.DataOutput; // javadocs - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -29,6 +17,18 @@ import org.apache.lucene.store.DataOutput; // javadocs * limitations under the License. 
*/ +import org.apache.lucene.codecs.Codec; // javadocs +import org.apache.lucene.codecs.LiveDocsFormat; // javadocs +import org.apache.lucene.codecs.SegmentInfosFormat; +import org.apache.lucene.codecs.SegmentInfosReader; +import org.apache.lucene.codecs.SegmentInfosWriter; +import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs +import org.apache.lucene.codecs.TermVectorsFormat; // javadocs +import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.SegmentInfos; // javadocs +import org.apache.lucene.store.DataOutput; // javadocs + /** * Lucene 4.0 Segments format. *

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java index 8e3f68177e3..b7fc81266e3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java @@ -25,9 +25,81 @@ import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.DataOutput; // javadocs import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +/** + * Lucene 4.0 Term Vectors format. + *

Term Vector support is optional on a field-by-field basis. It consists of + * 3 files.

+ *
    + *
  1. + *

    The Document Index or .tvx file.

    + *

    For each document, this stores the offset into the document data (.tvd) and + * field data (.tvf) files.

    + *

    DocumentIndex (.tvx) --> TVXVersion<DocumentPosition,FieldPosition> + * <sup>NumDocs</sup>

    + *
      + *
    • TVXVersion --> {@link DataOutput#writeInt Int32} (Lucene40TermVectorsReader.FORMAT_CURRENT)
    • + *
    • DocumentPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvd file)
    • + *
    • FieldPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvf file)
    • + *
    + *
  2. + *
  3. + *

    The Document or .tvd file.

    + *

    This contains, for each document, the number of fields, a list of the fields + * with term vector info and finally a list of pointers to the field information + * in the .tvf (Term Vector Fields) file.

    + *

    The .tvd file is used to map out the fields that have term vectors stored + * and where the field information is in the .tvf file.

    + *

    Document (.tvd) --> TVDVersion<NumFields, FieldNums, + * FieldPositions><sup>NumDocs</sup>

    + *
      + *
    • TVDVersion --> {@link DataOutput#writeInt Int32} (Lucene40TermVectorsReader.FORMAT_CURRENT)
    • + *
    • NumFields --> {@link DataOutput#writeVInt VInt}
    • + *
    • FieldNums --> <FieldNumDelta><sup>NumFields</sup>
    • + *
    • FieldNumDelta --> {@link DataOutput#writeVInt VInt}
    • + *
    • FieldPositions --> <FieldPositionDelta><sup>NumFields-1</sup>
    • + *
    • FieldPositionDelta --> {@link DataOutput#writeVLong VLong}
    • + *
    + *
  4. + *
  5. + *

    The Field or .tvf file.

    + *

    This file contains, for each field that has a term vector stored, a list of + * the terms, their frequencies and, optionally, position and offset + * information.

    + *

    Field (.tvf) --> TVFVersion<NumTerms, Position/Offset, TermFreqs> + * <sup>NumFields</sup>

    + *
      + *
    • TVFVersion --> {@link DataOutput#writeInt Int32} (Lucene40TermVectorsReader.FORMAT_CURRENT)
    • + *
    • NumTerms --> {@link DataOutput#writeVInt VInt}
    • + *
    • Position/Offset --> {@link DataOutput#writeByte Byte}
    • + *
    • TermFreqs --> <TermText, TermFreq, Positions?, Offsets?> + * <sup>NumTerms</sup>
    • + *
    • TermText --> <PrefixLength, Suffix>
    • + *
    • PrefixLength --> {@link DataOutput#writeVInt VInt}
    • + *
    • Suffix --> {@link DataOutput#writeString String}
    • + *
    • TermFreq --> {@link DataOutput#writeVInt VInt}
    • + *
    • Positions --> <{@link DataOutput#writeVInt VInt}><sup>TermFreq</sup>
    • + *
    • Offsets --> <{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}><sup>TermFreq</sup>
    • + *
    + *

    Notes:

    + *
      + *
    • Position/Offset byte stores whether this term vector has position or offset + * information stored.
    • + *
    • Term byte prefixes are shared. The PrefixLength is the number of initial + * bytes from the previous term which must be prepended to a term's suffix + * in order to form the term's bytes. Thus, if the previous term's text was "bone" + * and the term is "boy", the PrefixLength is two and the suffix is "y".
    • + *
    • Positions are stored as delta encoded VInts. This means we only store the + * difference of the current position from the last position.
    • + *
    • Offsets are stored as delta encoded VInts. The first VInt is the + * startOffset, the second is the endOffset.
    • + *
    + *
  6. + *
+ */ public class Lucene40TermVectorsFormat extends TermVectorsFormat { @Override