diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index 856e9bad901..7c2205aa3cf 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -40,12 +40,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; /** - * Writes numbers one of two ways: - * 1. packed ints as deltas from minValue - * 2. packed ints as ordinals to a table (if the number of values is small, e.g. <= 256) - * - * the latter is typically much smaller with lucene's sims, as only some byte values are used, - * but its often a nonlinear mapping, especially if you dont use crazy boosts. + * Writer for {@link Lucene42DocValuesFormat} */ class Lucene42DocValuesConsumer extends DocValuesConsumer { static final int VERSION_START = 0; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java index 16f36e59511..590396635b1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java @@ -19,14 +19,92 @@ package org.apache.lucene.codecs.lucene42; import java.io.IOException; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import 
org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.BlockPackedWriter; +/** + * Lucene 4.2 DocValues format. + *
+ * Encodes the three per-document value types (Numeric,Binary,Sorted) with five basic strategies. + *
+ *
+ * Files: + *
The DocValues metadata or .dvm file.
+ *For each DocValues field, this stores metadata, such as the offset into the + * DocValues data (.dvd)
+ *DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry>NumFields
+ *Sorted fields have two entries: a SortedEntry with the FST metadata, + * and an ordinary NumericEntry for the document-to-ord metadata.
+ *FieldNumber of -1 indicates the end of metadata.
+ *EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)
+ *DataOffset is the pointer to the start of the data in the DocValues data (.dvd)
+ *CompressionType indicates how Numeric values will be compressed: + *
MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. + * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). + * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) + * is written for the addresses. + *
The DocValues data or .dvd file.
+ *For each DocValues field, this stores the actual per-document data (the heavy-lifting)
+ *DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData>NumFields
+ *+ * NOTE: this uses the same format as {@link Lucene42DocValuesFormat} + * Numeric DocValues, but with different file extensions. + *
+ * Files: + *