mirror of https://github.com/apache/lucene.git
4.2 docvalues/norms file formats
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438703 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e862e17907
commit
b0d6ff2cc3
|
@ -40,12 +40,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Writes numbers one of two ways:
|
||||
* 1. packed ints as deltas from minValue
|
||||
* 2. packed ints as ordinals to a table (if the number of values is small, e.g. <= 256)
|
||||
*
|
||||
* the latter is typically much smaller with lucene's sims, as only some byte values are used,
|
||||
* but it's often a nonlinear mapping, especially if you don't use crazy boosts.
|
||||
* Writer for {@link Lucene42DocValuesFormat}
|
||||
*/
|
||||
class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
||||
static final int VERSION_START = 0;
|
||||
|
|
|
@ -19,14 +19,92 @@ package org.apache.lucene.codecs.lucene42;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
|
||||
/**
|
||||
* Lucene 4.2 DocValues format.
|
||||
* <p>
|
||||
* Encodes the three per-document value types (Numeric,Binary,Sorted) with five basic strategies.
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Delta-compressed Numerics: per-document integers written in blocks of 4096. For each block
|
||||
* the minimum value is encoded, and each entry is a delta from that minimum value.
|
||||
* <li>Table-compressed Numerics: when the number of unique values is very small, a lookup table
|
||||
* is written instead. Each per-document entry is instead the ordinal to this table.
|
||||
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||
* Each document's value can be addressed by docID*length.
|
||||
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||
* for each document. The addresses are written in blocks of 4096, with the current absolute
|
||||
* start for the block, and the average (expected) delta per entry. For each document the
|
||||
* deviation from the delta (actual - expected) is written.
|
||||
* <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
|
||||
* ordinals written using one of the numeric strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
* <ol>
|
||||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm" id="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For each DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry><sup>NumFields</sup></p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry</li>
|
||||
* <li>NumericEntry --> DataOffset,CompressionType,PackedVersion</li>
|
||||
* <li>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
|
||||
* <li>SortedEntry --> DataOffset,ValueCount</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>DataOffset,DataLength --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
* <p>EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>CompressionType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 4096 integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
* <li>1 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* </ul>
|
||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
|
||||
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
|
||||
* is written for the addresses.
|
||||
* <li><a name="dvd" id="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For each DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup></p>
|
||||
* <ul>
|
||||
* <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics</li>
|
||||
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
|
||||
* <li>SortedData --> {@link FST FST<Int64>}</li>
|
||||
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li>
|
||||
* <li>TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
|
||||
* <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}</li>
|
||||
* </ul>
|
||||
* </ol>
|
||||
*/
|
||||
public class Lucene42DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Sole constructor */
|
||||
public Lucene42DocValuesFormat() {
|
||||
super("Lucene42");
|
||||
}
|
||||
|
|
|
@ -46,6 +46,9 @@ import org.apache.lucene.util.packed.BlockPackedReader;
|
|||
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Reader for {@link Lucene42DocValuesFormat}
|
||||
*/
|
||||
class Lucene42DocValuesProducer extends DocValuesProducer {
|
||||
// metadata maps (just file pointers and minimal stuff)
|
||||
private final Map<Integer,NumericEntry> numerics;
|
||||
|
@ -56,12 +59,8 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
|
|||
// ram instances we have already loaded
|
||||
private final Map<Integer,NumericDocValues> numericInstances =
|
||||
new HashMap<Integer,NumericDocValues>();
|
||||
|
||||
// if this thing needs some TL state then we might put something
|
||||
// else in this map.
|
||||
private final Map<Integer,BinaryDocValues> binaryInstances =
|
||||
new HashMap<Integer,BinaryDocValues>();
|
||||
|
||||
private final Map<Integer,FST<Long>> fstInstances =
|
||||
new HashMap<Integer,FST<Long>>();
|
||||
|
||||
|
|
|
@ -25,8 +25,24 @@ import org.apache.lucene.codecs.NormsFormat;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Lucene 4.2 score normalization format.
|
||||
* <p>
|
||||
* NOTE: this uses the same format as {@link Lucene42DocValuesFormat}
|
||||
* Numeric DocValues, but with different file extensions.
|
||||
* <p>
|
||||
* Files:
|
||||
* <ul>
|
||||
* <li><tt>.nvd</tt>: Norms data (in the DocValues data format)</li>
|
||||
* <li><tt>.nvm</tt>: Norms metadata (in the DocValues metadata format)</li>
|
||||
* </ul>
|
||||
* @see Lucene42DocValuesFormat
|
||||
*/
|
||||
public class Lucene42NormsFormat extends NormsFormat {
|
||||
|
||||
/** Sole constructor */
|
||||
public Lucene42NormsFormat() {}
|
||||
|
||||
@Override
|
||||
public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
|
||||
return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||
|
|
Loading…
Reference in New Issue