mirror of https://github.com/apache/lucene.git
LUCENE-5969: fix compile/javadocs, tighten up backwards codecs, add more safety to 5.x fields/vectors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5969@1628070 13f79535-47bb-0310-9956-ffa450edef68
parent a06c00350b
commit 24005cdcc5
@@ -31,19 +31,11 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.MutableBits;
/** Optimized implementation of a vector of bits. This is more-or-less like
* java.util.BitSet, but also includes the following:
* <ul>
* <li>a count() method, which efficiently computes the number of one bits;</li>
* <li>optimized read from and write to disk;</li>
* <li>inlinable get() method;</li>
* <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
* </ul>
*
* @lucene.internal
/**
* Bitset for support of 4.x live documents
* @deprecated only for old 4.x segments
*/
// pkg-private: if this thing is generally useful then it can go back in .util,
// but the serialization must be here underneath the codec.
@Deprecated
final class BitVector implements Cloneable, MutableBits {
private byte[] bits;

@@ -52,7 +44,7 @@ final class BitVector implements Cloneable, MutableBits {
private int version;
/** Constructs a vector capable of holding <code>n</code> bits. */
public BitVector(int n) {
BitVector(int n) {
size = n;
bits = new byte[getNumBytes(size)];
count = 0;

@@ -90,27 +82,6 @@ final class BitVector implements Cloneable, MutableBits {
count = -1;
}
/** Sets the value of <code>bit</code> to true, and
* returns true if bit was already set */
public final boolean getAndSet(int bit) {
if (bit >= size) {
throw new ArrayIndexOutOfBoundsException("bit=" + bit + " size=" + size);
}
final int pos = bit >> 3;
final int v = bits[pos];
final int flag = 1 << (bit & 7);
if ((flag & v) != 0)
return true;
else {
bits[pos] = (byte) (v | flag);
if (count != -1) {
count++;
assert count <= size;
}
return false;
}
}
/** Sets the value of <code>bit</code> to zero. */
@Override
public final void clear(int bit) {

@@ -121,25 +92,6 @@ final class BitVector implements Cloneable, MutableBits {
count = -1;
}
public final boolean getAndClear(int bit) {
if (bit >= size) {
throw new ArrayIndexOutOfBoundsException(bit);
}
final int pos = bit >> 3;
final int v = bits[pos];
final int flag = 1 << (bit & 7);
if ((flag & v) == 0) {
return false;
} else {
bits[pos] &= ~flag;
if (count != -1) {
count--;
assert count >= 0;
}
return true;
}
}
/** Returns <code>true</code> if <code>bit</code> is one and
<code>false</code> if it is zero. */
@Override

@@ -150,7 +102,7 @@ final class BitVector implements Cloneable, MutableBits {
/** Returns the number of bits in this vector. This is also one greater than
the number of the largest valid bit number. */
public final int size() {
final int size() {
return size;
}

@@ -162,7 +114,7 @@ final class BitVector implements Cloneable, MutableBits {
/** Returns the total number of one bits in this vector. This is efficiently
computed and cached, so that, if the vector is not changed, no
recomputation is done for repeated calls. */
public final int count() {
final int count() {
// if the vector has been modified
if (count == -1) {
int c = 0;

@@ -177,7 +129,7 @@ final class BitVector implements Cloneable, MutableBits {
}
/** For testing */
public final int getRecomputedCount() {
final int getRecomputedCount() {
int c = 0;
int end = bits.length;
for (int i = 0; i < end; i++) {

@@ -191,29 +143,29 @@ final class BitVector implements Cloneable, MutableBits {
private static String CODEC = "BitVector";
// Version before version tracking was added:
public final static int VERSION_PRE = -1;
final static int VERSION_PRE = -1;
// First version:
public final static int VERSION_START = 0;
final static int VERSION_START = 0;
// Changed DGaps to encode gaps between cleared bits, not
// set:
public final static int VERSION_DGAPS_CLEARED = 1;
final static int VERSION_DGAPS_CLEARED = 1;
// added checksum
public final static int VERSION_CHECKSUM = 2;
final static int VERSION_CHECKSUM = 2;
// Increment version to change it:
public final static int VERSION_CURRENT = VERSION_CHECKSUM;
final static int VERSION_CURRENT = VERSION_CHECKSUM;
public int getVersion() {
int getVersion() {
return version;
}
/** Writes this vector to the file <code>name</code> in Directory
<code>d</code>, in a format that can be read by the constructor {@link
#BitVector(Directory, String, IOContext)}. */
public final void write(Directory d, String name, IOContext context) throws IOException {
final void write(Directory d, String name, IOContext context) throws IOException {
assert !(d instanceof CompoundFileDirectory);
try (IndexOutput output = d.createOutput(name, context)) {
output.writeInt(-2);

@@ -230,7 +182,7 @@ final class BitVector implements Cloneable, MutableBits {
}
/** Invert all bits */
public void invertAll() {
void invertAll() {
if (count != -1) {
count = size - count;
}

@@ -254,13 +206,6 @@ final class BitVector implements Cloneable, MutableBits {
}
}
/** Set all bits */
public void setAll() {
Arrays.fill(bits, (byte) 0xff);
clearUnusedBits();
count = size;
}
/** Write as a bit set */
private void writeBits(IndexOutput output) throws IOException {
output.writeInt(size()); // write size

@@ -325,7 +270,7 @@ final class BitVector implements Cloneable, MutableBits {
/** Constructs a bit vector from the file <code>name</code> in Directory
<code>d</code>, as written by the {@link #write} method.
*/
public BitVector(Directory d, String name, IOContext context) throws IOException {
BitVector(Directory d, String name, IOContext context) throws IOException {
try (ChecksumIndexInput input = d.openChecksumInput(name, context)) {
final int firstInt = input.readInt();
@@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene40;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;

@@ -30,17 +29,9 @@ import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
* Implements the Lucene 4.0 index format, with configurable per-field postings formats.
* <p>
* If you want to reuse functionality of this codec in another codec, extend
* {@link FilterCodec}.
*
* @see org.apache.lucene.codecs.lucene40 package documentation for file format details.
* Reader for the 4.0 file format
* @deprecated Only for reading old 4.0 segments
*/
// NOTE: if we make largish changes in a minor release, easier to just make Lucene42Codec or whatever
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
// (it writes a minor version, etc).
@Deprecated
public class Lucene40Codec extends Codec {
private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat();
@@ -19,119 +19,22 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 4.0 DocValues format.
* <p>
* Files:
* <ul>
* <li><tt>.dv.cfs</tt>: {@link CompoundFileDirectory compound container}</li>
* <li><tt>.dv.cfe</tt>: {@link CompoundFileDirectory compound entries}</li>
* </ul>
* Entries within the compound file:
* <ul>
* <li><tt><segment>_<fieldNumber>.dat</tt>: data values</li>
* <li><tt><segment>_<fieldNumber>.idx</tt>: index into the .dat for DEREF types</li>
* </ul>
* <p>
* There are several many types of {@code DocValues} with different encodings.
* From the perspective of filenames, all types store their values in <tt>.dat</tt>
* entries within the compound file. In the case of dereferenced/sorted types, the <tt>.dat</tt>
* actually contains only the unique values, and an additional <tt>.idx</tt> file contains
* pointers to these unique values.
* </p>
* Formats:
* <ul>
* <li>{@code VAR_INTS} .dat --> Header, PackedType, MinValue,
* DefaultValue, PackedStream</li>
* <li>{@code FIXED_INTS_8} .dat --> Header, ValueSize,
* {@link DataOutput#writeByte Byte}<sup>maxdoc</sup></li>
* <li>{@code FIXED_INTS_16} .dat --> Header, ValueSize,
* {@link DataOutput#writeShort Short}<sup>maxdoc</sup></li>
* <li>{@code FIXED_INTS_32} .dat --> Header, ValueSize,
* {@link DataOutput#writeInt Int32}<sup>maxdoc</sup></li>
* <li>{@code FIXED_INTS_64} .dat --> Header, ValueSize,
* {@link DataOutput#writeLong Int64}<sup>maxdoc</sup></li>
* <li>{@code FLOAT_32} .dat --> Header, ValueSize, Float32<sup>maxdoc</sup></li>
* <li>{@code FLOAT_64} .dat --> Header, ValueSize, Float64<sup>maxdoc</sup></li>
* <li>{@code BYTES_FIXED_STRAIGHT} .dat --> Header, ValueSize,
* ({@link DataOutput#writeByte Byte} * ValueSize)<sup>maxdoc</sup></li>
* <li>{@code BYTES_VAR_STRAIGHT} .idx --> Header, TotalBytes, Addresses</li>
* <li>{@code BYTES_VAR_STRAIGHT} .dat --> Header,
({@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>maxdoc</sup></li>
* <li>{@code BYTES_FIXED_DEREF} .idx --> Header, NumValues, Addresses</li>
* <li>{@code BYTES_FIXED_DEREF} .dat --> Header, ValueSize,
* ({@link DataOutput#writeByte Byte} * ValueSize)<sup>NumValues</sup></li>
* <li>{@code BYTES_VAR_DEREF} .idx --> Header, TotalVarBytes, Addresses</li>
* <li>{@code BYTES_VAR_DEREF} .dat --> Header,
* (LengthPrefix + {@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>NumValues</sup></li>
* <li>{@code BYTES_FIXED_SORTED} .idx --> Header, NumValues, Ordinals</li>
* <li>{@code BYTES_FIXED_SORTED} .dat --> Header, ValueSize,
* ({@link DataOutput#writeByte Byte} * ValueSize)<sup>NumValues</sup></li>
* <li>{@code BYTES_VAR_SORTED} .idx --> Header, TotalVarBytes, Addresses, Ordinals</li>
* <li>{@code BYTES_VAR_SORTED} .dat --> Header,
* ({@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>NumValues</sup></li>
* </ul>
* Data Types:
* <ul>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>PackedType --> {@link DataOutput#writeByte Byte}</li>
* <li>MaxAddress, MinValue, DefaultValue --> {@link DataOutput#writeLong Int64}</li>
* <li>PackedStream, Addresses, Ordinals --> {@link PackedInts}</li>
* <li>ValueSize, NumValues --> {@link DataOutput#writeInt Int32}</li>
* <li>Float32 --> 32-bit float encoded with {@link Float#floatToRawIntBits(float)}
* then written as {@link DataOutput#writeInt Int32}</li>
* <li>Float64 --> 64-bit float encoded with {@link Double#doubleToRawLongBits(double)}
* then written as {@link DataOutput#writeLong Int64}</li>
* <li>TotalBytes --> {@link DataOutput#writeVLong VLong}</li>
* <li>TotalVarBytes --> {@link DataOutput#writeLong Int64}</li>
* <li>LengthPrefix --> Length of the data value as {@link DataOutput#writeVInt VInt} (maximum
* of 2 bytes)</li>
* </ul>
* Notes:
* <ul>
* <li>PackedType is a 0 when compressed, 1 when the stream is written as 64-bit integers.</li>
* <li>Addresses stores pointers to the actual byte location (indexed by docid). In the VAR_STRAIGHT
* case, each entry can have a different length, so to determine the length, docid+1 is
* retrieved. A sentinel address is written at the end for the VAR_STRAIGHT case, so the Addresses
* stream contains maxdoc+1 indices. For the deduplicated VAR_DEREF case, each length
* is encoded as a prefix to the data itself as a {@link DataOutput#writeVInt VInt}
* (maximum of 2 bytes).</li>
* <li>Ordinals stores the term ID in sorted order (indexed by docid). In the FIXED_SORTED case,
* the address into the .dat can be computed from the ordinal as
* <code>Header+ValueSize+(ordinal*ValueSize)</code> because the byte length is fixed.
* In the VAR_SORTED case, there is double indirection (docid -> ordinal -> address), but
* an additional sentinel ordinal+address is always written (so there are NumValues+1 ordinals). To
* determine the length, ord+1's address is looked up as well.</li>
* <li>{@code BYTES_VAR_STRAIGHT BYTES_VAR_STRAIGHT} in contrast to other straight
* variants uses a <tt>.idx</tt> file to improve lookup perfromance. In contrast to
* {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values.
* </li>
* </ul>
* <p>
* Limitations:
* <ul>
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
* </ul>
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
// NOTE: not registered in SPI, doesnt respect segment suffix, etc
// for back compat only!
public class Lucene40DocValuesFormat extends DocValuesFormat {
/** Maximum length for each binary doc values field. */
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
/** Sole constructor. */
public Lucene40DocValuesFormat() {

@@ -144,7 +47,7 @@ public class Lucene40DocValuesFormat extends DocValuesFormat {
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
String filename = IndexFileNames.segmentFileName(state.segmentInfo.name,
"dv",
IndexFileNames.COMPOUND_FILE_EXTENSION);
@@ -48,7 +48,6 @@ import org.apache.lucene.util.packed.PackedInts;
/**
* Reads the 4.0 format of norms/docvalues
* @lucene.experimental
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
@@ -19,79 +19,12 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.store.DataOutput; // javadoc
/**
* Lucene 4.0 Field Infos format.
* <p>
* <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.</p>
* <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
* FieldBits,DocValuesBits,Attributes> <sup>FieldsCount</sup></p>
* <p>Data types:
* <ul>
* <li>Header --> {@link CodecUtil#checkHeader CodecHeader}</li>
* <li>FieldsCount --> {@link DataOutput#writeVInt VInt}</li>
* <li>FieldName --> {@link DataOutput#writeString String}</li>
* <li>FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}</li>
* <li>FieldNumber --> {@link DataOutput#writeInt VInt}</li>
* <li>Attributes --> {@link DataOutput#writeStringStringMap Map<String,String>}</li>
* </ul>
* </p>
* Field Descriptions:
* <ul>
* <li>FieldsCount: the number of fields in this file.</li>
* <li>FieldName: name of the field as a UTF-8 String.</li>
* <li>FieldNumber: the field's number. Note that unlike previous versions of
* Lucene, the fields are not numbered implicitly by their order in the
* file, instead explicitly.</li>
* <li>FieldBits: a byte containing field options.
* <ul>
* <li>The low-order bit is one for indexed fields, and zero for non-indexed
* fields.</li>
* <li>The second lowest-order bit is one for fields that have term vectors
* stored, and zero for fields without term vectors.</li>
* <li>If the third lowest order-bit is set (0x4), offsets are stored into
* the postings list in addition to positions.</li>
* <li>Fourth bit is unused.</li>
* <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the
* indexed field.</li>
* <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the
* indexed field.</li>
* <li>If the seventh lowest-order bit is set (0x40), term frequencies and
* positions omitted for the indexed field.</li>
* <li>If the eighth lowest-order bit is set (0x80), positions are omitted for the
* indexed field.</li>
* </ul>
* </li>
* <li>DocValuesBits: a byte containing per-document value types. The type
* recorded as two four-bit integers, with the high-order bits representing
* <code>norms</code> options, and the low-order bits representing
* {@code DocValues} options. Each four-bit integer can be decoded as such:
* <ul>
* <li>0: no DocValues for this field.</li>
* <li>1: variable-width signed integers. ({@code Type#VAR_INTS VAR_INTS})</li>
* <li>2: 32-bit floating point values. ({@code Type#FLOAT_32 FLOAT_32})</li>
* <li>3: 64-bit floating point values. ({@code Type#FLOAT_64 FLOAT_64})</li>
* <li>4: fixed-length byte array values. ({@code Type#BYTES_FIXED_STRAIGHT BYTES_FIXED_STRAIGHT})</li>
* <li>5: fixed-length dereferenced byte array values. ({@code Type#BYTES_FIXED_DEREF BYTES_FIXED_DEREF})</li>
* <li>6: variable-length byte array values. ({@code Type#BYTES_VAR_STRAIGHT BYTES_VAR_STRAIGHT})</li>
* <li>7: variable-length dereferenced byte array values. ({@code Type#BYTES_VAR_DEREF BYTES_VAR_DEREF})</li>
* <li>8: 16-bit signed integers. ({@code Type#FIXED_INTS_16 FIXED_INTS_16})</li>
* <li>9: 32-bit signed integers. ({@code Type#FIXED_INTS_32 FIXED_INTS_32})</li>
* <li>10: 64-bit signed integers. ({@code Type#FIXED_INTS_64 FIXED_INTS_64})</li>
* <li>11: 8-bit signed integers. ({@code Type#FIXED_INTS_8 FIXED_INTS_8})</li>
* <li>12: fixed-length sorted byte array values. ({@code Type#BYTES_FIXED_SORTED BYTES_FIXED_SORTED})</li>
* <li>13: variable-length sorted byte array values. ({@code Type#BYTES_VAR_SORTED BYTES_VAR_SORTED})</li>
* </ul>
* </li>
* <li>Attributes: a key-value map of codec-private attributes.</li>
* </ul>
*
* @lucene.experimental
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated

@@ -103,7 +36,7 @@ public class Lucene40FieldInfosFormat extends FieldInfosFormat {
}
@Override
public FieldInfosReader getFieldInfosReader() throws IOException {
public final FieldInfosReader getFieldInfosReader() throws IOException {
return reader;
}
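Note: the DocValuesBits byte described in the Lucene40FieldInfosFormat javadoc above packs two four-bit values into one byte, and can be unpacked with plain bit masks. A minimal sketch (variable names are illustrative, not taken from the actual reader):

    int docValuesType = docValuesBits & 0x0F;          // low nibble: DocValues type (0..13 per the table above)
    int normsType     = (docValuesBits >>> 4) & 0x0F;  // high nibble: norms type, same table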
@@ -37,13 +37,10 @@ import org.apache.lucene.util.IOUtils;
/**
* Lucene 4.0 FieldInfos reader.
*
* @lucene.experimental
* @see Lucene40FieldInfosFormat
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
class Lucene40FieldInfosReader extends FieldInfosReader {
final class Lucene40FieldInfosReader extends FieldInfosReader {
/** Sole constructor. */
public Lucene40FieldInfosReader() {
@@ -20,12 +20,10 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;

@@ -33,37 +31,10 @@ import org.apache.lucene.util.MutableBits;
/**
* Lucene 4.0 Live Documents Format.
* <p>
* <p>The .del file is optional, and only exists when a segment contains
* deletions.</p>
* <p>Although per-segment, this file is maintained exterior to compound segment
* files.</p>
* <p>Deletions (.del) --> Format,Header,ByteCount,BitCount, Bits | DGaps (depending
* on Format)</p>
* <ul>
* <li>Format,ByteSize,BitCount --> {@link DataOutput#writeInt Uint32}</li>
* <li>Bits --> <{@link DataOutput#writeByte Byte}> <sup>ByteCount</sup></li>
* <li>DGaps --> <DGap,NonOnesByte> <sup>NonzeroBytesCount</sup></li>
* <li>DGap --> {@link DataOutput#writeVInt VInt}</li>
* <li>NonOnesByte --> {@link DataOutput#writeByte Byte}</li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* </ul>
* <p>Format is 1: indicates cleared DGaps.</p>
* <p>ByteCount indicates the number of bytes in Bits. It is typically
* (SegSize/8)+1.</p>
* <p>BitCount indicates the number of bits that are currently set in Bits.</p>
* <p>Bits contains one bit for each document indexed. When the bit corresponding
* to a document number is cleared, that document is marked as deleted. Bit ordering
* is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
* 0x02, then document 9 is marked as alive (not deleted).</p>
* <p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
* of DGaps on indexes of nonOnes bytes in Bits, and the nonOnes bytes themselves.
* The number of nonOnes bytes in Bits (NonOnesBytesCount) is not stored.</p>
* <p>For example, if there are 8000 bits and only bits 10,12,32 are cleared, DGaps
* would be used:</p>
* <p>(VInt) 1 , (byte) 20 , (VInt) 3 , (Byte) 1</p>
* @deprecated Only for reading old 4.x segments
*/
public class Lucene40LiveDocsFormat extends LiveDocsFormat {
@Deprecated
public final class Lucene40LiveDocsFormat extends LiveDocsFormat {
/** Extension of deletes */
static final String DELETES_EXTENSION = "del";
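Note: the worked Bits example in the Lucene40LiveDocsFormat javadoc above (two bytes, 0x00 and 0x02, document 9 alive) follows from the same byte/bit arithmetic BitVector uses earlier in this commit. A minimal check, assuming the least-to-most-significant bit order described there:

    byte[] bits = { 0x00, 0x02 };
    int doc = 9;
    boolean alive = (bits[doc >> 3] & (1 << (doc & 7))) != 0;  // byte 1, bit 1 is set, so doc 9 is alive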
@@ -25,21 +25,9 @@ import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.CompoundFileDirectory;
/**
* Lucene 4.0 Norms Format.
* <p>
* Files:
* <ul>
* <li><tt>.nrm.cfs</tt>: {@link CompoundFileDirectory compound container}</li>
* <li><tt>.nrm.cfe</tt>: {@link CompoundFileDirectory compound entries}</li>
* </ul>
* Norms are implemented as DocValues, so other than file extension, norms are
* written exactly the same way as {@link Lucene40DocValuesFormat DocValues}.
*
* @see Lucene40DocValuesFormat
* @lucene.experimental
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
@@ -27,14 +27,13 @@ import org.apache.lucene.util.Accountable;
/**
* Reads 4.0/4.1 norms.
* Implemented the same as docvalues, but with a different filename.
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
class Lucene40NormsReader extends NormsProducer {
final class Lucene40NormsReader extends NormsProducer {
private final Lucene40DocValuesReader impl;
public Lucene40NormsReader(SegmentReadState state, String filename) throws IOException {
Lucene40NormsReader(SegmentReadState state, String filename) throws IOException {
impl = new Lucene40DocValuesReader(state, filename, Lucene40FieldInfosReader.LEGACY_NORM_TYPE_KEY);
}
@@ -26,17 +26,13 @@ import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* Provides a {@link PostingsReaderBase} and {@link
* PostingsWriterBase}.
*
* PostingsReaderBase for 4.0 segments
* @deprecated Only for reading old 4.0 segments */
// TODO: should these also be named / looked up via SPI?
@Deprecated
public final class Lucene40PostingsBaseFormat extends PostingsBaseFormat {
final class Lucene40PostingsBaseFormat extends PostingsBaseFormat {
/** Sole constructor. */
public Lucene40PostingsBaseFormat() {
Lucene40PostingsBaseFormat() {
super("Lucene40");
}
@@ -19,226 +19,25 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase; // javadocs
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.util.fst.FST; // javadocs
/**
* Lucene 4.0 Postings format.
* <p>
* Files:
* <ul>
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
* <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
* <li><tt>.frq</tt>: <a href="#Frequencies">Frequencies</a></li>
* <li><tt>.prx</tt>: <a href="#Positions">Positions</a></li>
* </ul>
* </p>
* <p>
* <a name="Termdictionary" id="Termdictionary"></a>
* <h3>Term Dictionary</h3>
*
* <p>The .tim file contains the list of terms in each
* field along with per-term statistics (such as docfreq)
* and pointers to the frequencies, positions and
* skip data in the .frq and .prx files.
* See {@link BlockTreeTermsWriter} for more details on the format.
* </p>
*
* <p>NOTE: The term dictionary can plug into different postings implementations:
* the postings writer/reader are actually responsible for encoding
* and decoding the Postings Metadata and Term Metadata sections described here:</p>
* <ul>
* <li>Postings Metadata --> Header, SkipInterval, MaxSkipLevels, SkipMinimum</li>
* <li>Term Metadata --> FreqDelta, SkipDelta?, ProxDelta?
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>SkipInterval,MaxSkipLevels,SkipMinimum --> {@link DataOutput#writeInt Uint32}</li>
* <li>SkipDelta,FreqDelta,ProxDelta --> {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the postings.</li>
* <li>SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate
* {@link DocsEnum#advance(int)}. Larger values result in smaller indexes, greater
* acceleration, but fewer accelerable cases, while smaller values result in bigger indexes,
* less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases.
* </li>
* <li>MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A
* low value results in smaller indexes but less acceleration, a larger value results in
* slightly larger indexes but greater acceleration. See format of .frq file for more
* information about skip levels.</li>
* <li>SkipMinimum is the minimum document frequency a term must have in order to write any
* skip data at all.</li>
* <li>FreqDelta determines the position of this term's TermFreqs within the .frq
* file. In particular, it is the difference between the position of this term's
* data in that file and the position of the previous term's data (or zero, for
* the first term in the block).</li>
* <li>ProxDelta determines the position of this term's TermPositions within the
* .prx file. In particular, it is the difference between the position of this
* term's data in that file and the position of the previous term's data (or zero,
* for the first term in the block. For fields that omit position data, this will
* be 0 since prox information is not stored.</li>
* <li>SkipDelta determines the position of this term's SkipData within the .frq
* file. In particular, it is the number of bytes after TermFreqs that the
* SkipData starts. In other words, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum.</li>
* </ul>
* <a name="Termindex" id="Termindex"></a>
* <h3>Term Index</h3>
* <p>The .tip file contains an index into the term dictionary, so that it can be
* accessed randomly. See {@link BlockTreeTermsWriter} for more details on the format.</p>
* <a name="Frequencies" id="Frequencies"></a>
* <h3>Frequencies</h3>
* <p>The .frq file contains the lists of documents which contain each term, along
* with the frequency of the term in that document (except when frequencies are
* omitted: {@link IndexOptions#DOCS_ONLY}).</p>
* <ul>
* <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData?> <sup>TermCount</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermFreqs --> <TermFreq> <sup>DocFreq</sup></li>
* <li>TermFreq --> DocDelta[, Freq?]</li>
* <li>SkipData --> <<SkipLevelLength, SkipLevel>
* <sup>NumSkipLevels-1</sup>, SkipLevel> <SkipDatum></li>
* <li>SkipLevel --> <SkipDatum> <sup>DocFreq/(SkipInterval^(Level +
* 1))</sup></li>
* <li>SkipDatum -->
* DocSkip,PayloadLength?,OffsetLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?</li>
* <li>DocDelta,Freq,DocSkip,PayloadLength,OffsetLength,FreqSkip,ProxSkip --> {@link DataOutput#writeVInt VInt}</li>
* <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>TermFreqs are ordered by term (the term is implicit, from the term dictionary).</p>
* <p>TermFreq entries are ordered by increasing document number.</p>
* <p>DocDelta: if frequencies are indexed, this determines both the document
* number and the frequency. In particular, DocDelta/2 is the difference between
* this document number and the previous document number (or zero when this is the
* first document in a TermFreqs). When DocDelta is odd, the frequency is one.
* When DocDelta is even, the frequency is read as another VInt. If frequencies
* are omitted, DocDelta contains the gap (not multiplied by 2) between document
* numbers and no frequency information is stored.</p>
* <p>For example, the TermFreqs for a term which occurs once in document seven
* and three times in document eleven, with frequencies indexed, would be the
* following sequence of VInts:</p>
* <p>15, 8, 3</p>
* <p>If frequencies were omitted ({@link IndexOptions#DOCS_ONLY}) it would be this
* sequence of VInts instead:</p>
* <p>7,4</p>
* <p>DocSkip records the document number before every SkipInterval <sup>th</sup>
* document in TermFreqs. If payloads and offsets are disabled for the term's field, then
* DocSkip represents the difference from the previous value in the sequence. If
* payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the
* difference from the previous value in the sequence. In this case when
* DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of
* the last payload/offset before the SkipInterval<sup>th</sup> document in TermPositions.</p>
* <p>PayloadLength indicates the length of the last payload.</p>
* <p>OffsetLength indicates the length of the last offset (endOffset-startOffset).</p>
* <p>
* FreqSkip and ProxSkip record the position of every SkipInterval <sup>th</sup>
* entry in FreqFile and ProxFile, respectively. File positions are relative to
* the start of TermFreqs and Positions, to the previous SkipDatum in the
* sequence.</p>
* <p>For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData
* entries, containing the 15 <sup>th</sup> and 31 <sup>st</sup> document numbers
* in TermFreqs. The first FreqSkip names the number of bytes after the beginning
* of TermFreqs that the 16 <sup>th</sup> SkipDatum starts, and the second the
* number of bytes after that that the 32 <sup>nd</sup> starts. The first ProxSkip
* names the number of bytes after the beginning of Positions that the 16
* <sup>th</sup> SkipDatum starts, and the second the number of bytes after that
* that the 32 <sup>nd</sup> starts.</p>
* <p>Each term can have multiple skip levels. The amount of skip levels for a
* term is NumSkipLevels = Min(MaxSkipLevels,
* floor(log(DocFreq/log(SkipInterval)))). The number of SkipData entries for a
* skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip level
* is Level=0.<br>
* Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0
* has 8 SkipData entries, containing the 3<sup>rd</sup>, 7<sup>th</sup>,
* 11<sup>th</sup>, 15<sup>th</sup>, 19<sup>th</sup>, 23<sup>rd</sup>,
* 27<sup>th</sup>, and 31<sup>st</sup> document numbers in TermFreqs. Skip level
* 1 has 2 SkipData entries, containing the 15<sup>th</sup> and 31<sup>st</sup>
* document numbers in TermFreqs.<br>
* The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer
* referencing the corresponding SkipData entry in level-1. In the example has
* entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a
* pointer to entry 31 on level 0.
* </p>
* <a name="Positions" id="Positions"></a>
* <h3>Positions</h3>
* <p>The .prx file contains the lists of positions that each term occurs at
* within documents. Note that fields omitting positional data do not store
* anything into this file, and if all fields in the index omit positional data
* then the .prx file will not exist.</p>
* <ul>
* <li>ProxFile (.prx) --> Header, <TermPositions> <sup>TermCount</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPositions --> <Positions> <sup>DocFreq</sup></li>
* <li>Positions --> <PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?> <sup>Freq</sup></li>
* <li>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayloadLength</sup></li>
* </ul>
* <p>TermPositions are ordered by term (the term is implicit, from the term dictionary).</p>
* <p>Positions entries are ordered by increasing document number (the document
* number is implicit from the .frq file).</p>
* <p>PositionDelta is, if payloads are disabled for the term's field, the
* difference between the position of the current occurrence in the document and
* the previous occurrence (or zero, if this is the first occurrence in this
* document). If payloads are enabled for the term's field, then PositionDelta/2
* is the difference between the current and the previous position. If payloads
* are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
* the length of the payload at the current term position.</p>
* <p>For example, the TermPositions for a term which occurs as the fourth term in
* one document, and as the fifth and ninth term in a subsequent document, would
* be the following sequence of VInts (payloads disabled):</p>
* <p>4, 5, 4</p>
* <p>PayloadData is metadata associated with the current term position. If
* PayloadLength is stored at the current position, then it indicates the length
* of this payload. If PayloadLength is not stored, then this payload has the same
* length as the payload at the previous position.</p>
* <p>OffsetDelta/2 is the difference between this position's startOffset from the
* previous occurrence (or zero, if this is the first occurrence in this document).
* If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
* previous occurrence and an OffsetLength follows. Offset data is only written for
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.</p>
*
* @deprecated Only for reading old 4.0 segments */
// TODO: this class could be created by wrapping
// BlockTreeTermsDict around Lucene40PostingsBaseFormat; ie
// we should not duplicate the code from that class here:
* @deprecated Only for reading old 4.0 segments
*/
@Deprecated
public class Lucene40PostingsFormat extends PostingsFormat {
/** minimum items (terms or sub-blocks) per block for BlockTree */
protected final int minBlockSize;
/** maximum items (terms or sub-blocks) per block for BlockTree */
protected final int maxBlockSize;
/** Creates {@code Lucene40PostingsFormat} with default
* settings. */
public Lucene40PostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/** Creates {@code Lucene40PostingsFormat} with custom
* values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
* @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
private Lucene40PostingsFormat(int minBlockSize, int maxBlockSize) {
super("Lucene40");
this.minBlockSize = minBlockSize;
assert minBlockSize > 1;
this.maxBlockSize = maxBlockSize;
}
@Override

@@ -247,7 +46,7 @@ public class Lucene40PostingsFormat extends PostingsFormat {
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
public final FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postings = new Lucene40PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
boolean success = false;

@@ -276,6 +75,6 @@ public class Lucene40PostingsFormat extends PostingsFormat {
@Override
public String toString() {
return getName() + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
return getName();
}
}
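Note: the "15, 8, 3" example in the Lucene40PostingsFormat .frq javadoc above follows from the DocDelta rule (DocDelta/2 is the doc gap; an odd DocDelta means the frequency is one, an even DocDelta means a frequency VInt follows). A minimal encoding sketch, assuming out is a DataOutput and lastDoc starts at 0 (illustrative only, not the actual writer):

    int gap = doc - lastDoc;
    if (freq == 1) {
      out.writeVInt((gap << 1) | 1);   // doc 7, freq 1  -> (7 << 1) | 1 = 15
    } else {
      out.writeVInt(gap << 1);         // doc 11 (gap 4) -> 4 << 1 = 8
      out.writeVInt(freq);             // freq 3         -> 3
    }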
@@ -43,13 +43,10 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
/**
* Concrete class that reads the 4.0 frq/prox
* postings format.
*
* @see Lucene40PostingsFormat
* Reader for 4.0 postings format
* @deprecated Only for reading old 4.0 segments */
@Deprecated
public class Lucene40PostingsReader extends PostingsReaderBase {
final class Lucene40PostingsReader extends PostingsReaderBase {
final static String TERMS_CODEC = "Lucene40PostingsWriterTerms";
final static String FRQ_CODEC = "Lucene40PostingsWriterFrq";
@@ -17,57 +17,14 @@ package org.apache.lucene.codecs.lucene40;
* limitations under the License.
*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.SegmentInfoReader;
import org.apache.lucene.codecs.SegmentInfoWriter;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.SegmentInfo; // javadocs
import org.apache.lucene.index.SegmentInfos; // javadocs
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.index.SegmentInfo;
/**
* Lucene 4.0 Segment info format.
* <p>
* Files:
* <ul>
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Attributes, Files
* </ul>
* </p>
* Data types:
* <p>
* <ul>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>SegSize --> {@link DataOutput#writeInt Int32}</li>
* <li>SegVersion --> {@link DataOutput#writeString String}</li>
* <li>Files --> {@link DataOutput#writeStringSet Set<String>}</li>
* <li>Diagnostics, Attributes --> {@link DataOutput#writeStringStringMap Map<String,String>}</li>
* <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}</li>
* </ul>
* </p>
* Field Descriptions:
* <p>
* <ul>
* <li>SegVersion is the code version that created the segment.</li>
* <li>SegSize is the number of documents contained in the segment index.</li>
* <li>IsCompoundFile records whether the segment is written as a compound file or
* not. If this is -1, the segment is not a compound file. If it is 1, the segment
* is a compound file.</li>
* <li>Checksum contains the CRC32 checksum of all bytes in the segments_N file up
* until the checksum. This is used to verify integrity of the file on opening the
* index.</li>
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
* for each segment it creates. It includes metadata like the current Lucene
* version, OS, Java version, why the segment was created (merge, flush,
* addIndexes), etc.</li>
* <li>Attributes: a key-value map of codec-private attributes.</li>
* <li>Files is a list of files referred to by this segment.</li>
* </ul>
* </p>
*
* @see SegmentInfos
* @lucene.experimental
* @deprecated Only for reading old 4.0-4.5 segments, and supporting IndexWriter.addIndexes
* @deprecated Only for reading old 4.0-4.5 segments
*/
@Deprecated
public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {

@@ -78,7 +35,7 @@ public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
}
@Override
public SegmentInfoReader getSegmentInfoReader() {
public final SegmentInfoReader getSegmentInfoReader() {
return reader;
}

@@ -88,7 +45,7 @@ public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
}
/** File extension used to store {@link SegmentInfo}. */
public final static String SI_EXTENSION = "si";
static final String SI_EXTENSION = "si";
static final String CODEC_NAME = "Lucene40SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
@@ -34,14 +34,11 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
* Lucene 4.0 implementation of {@link SegmentInfoReader}.
*
* @see Lucene40SegmentInfoFormat
* @lucene.experimental
* Lucene 4.0 SI reader
* @deprecated Only for reading old 4.0-4.5 segments
*/
@Deprecated
public class Lucene40SegmentInfoReader extends SegmentInfoReader {
final class Lucene40SegmentInfoReader extends SegmentInfoReader {
/** Sole constructor. */
public Lucene40SegmentInfoReader() {
@@ -24,14 +24,11 @@ import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for the 4.0 posting list format
* that stores positions and payloads.
*
* @see Lucene40PostingsFormat
* Lucene 4.0 skiplist reader
* @deprecated Only for reading old 4.0 segments
*/
@Deprecated
public class Lucene40SkipListReader extends MultiLevelSkipListReader {
final class Lucene40SkipListReader extends MultiLevelSkipListReader {
private boolean currentFieldStoresPayloads;
private boolean currentFieldStoresOffsets;
private long freqPointer[];
@@ -19,66 +19,18 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
/**
* Lucene 4.0 Stored Fields Format.
* <p>Stored fields are represented by two files:</p>
* <ol>
* <li><a name="field_index" id="field_index"></a>
* <p>The field index, or <tt>.fdx</tt> file.</p>
* <p>This is used to find the location within the field data file of the fields
* of a particular document. Because it contains fixed-length data, this file may
* be easily randomly accessed. The position of document <i>n</i> 's field data is
* the {@link DataOutput#writeLong Uint64} at <i>n*8</i> in this file.</p>
* <p>This contains, for each document, a pointer to its field data, as
* follows:</p>
* <ul>
* <li>FieldIndex (.fdx) --> <Header>, <FieldValuesPosition> <sup>SegSize</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>FieldValuesPosition --> {@link DataOutput#writeLong Uint64}</li>
* </ul>
* </li>
* <li>
* <p><a name="field_data" id="field_data"></a>The field data, or <tt>.fdt</tt> file.</p>
* <p>This contains the stored fields of each document, as follows:</p>
* <ul>
* <li>FieldData (.fdt) --> <Header>, <DocFieldData> <sup>SegSize</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DocFieldData --> FieldCount, <FieldNum, Bits, Value>
* <sup>FieldCount</sup></li>
* <li>FieldCount --> {@link DataOutput#writeVInt VInt}</li>
* <li>FieldNum --> {@link DataOutput#writeVInt VInt}</li>
* <li>Bits --> {@link DataOutput#writeByte Byte}</li>
* <ul>
* <li>low order bit reserved.</li>
* <li>second bit is one for fields containing binary data</li>
* <li>third bit reserved.</li>
* <li>4th to 6th bit (mask: 0x7<<3) define the type of a numeric field:
* <ul>
* <li>all bits in mask are cleared if no numeric field at all</li>
* <li>1<<3: Value is Int</li>
* <li>2<<3: Value is Long</li>
* <li>3<<3: Value is Int as Float (as of {@link Float#intBitsToFloat(int)}</li>
* <li>4<<3: Value is Long as Double (as of {@link Double#longBitsToDouble(long)}</li>
* </ul>
* </li>
* </ul>
* <li>Value --> String | BinaryValue | Int | Long (depending on Bits)</li>
* <li>BinaryValue --> ValueSize, <{@link DataOutput#writeByte Byte}>^ValueSize</li>
* <li>ValueSize --> {@link DataOutput#writeVInt VInt}</li>
* </li>
* </ul>
* </ol>
* @lucene.experimental */
* @deprecated only for reading 4.0 segments */
@Deprecated
public class Lucene40StoredFieldsFormat extends StoredFieldsFormat {
/** Sole constructor. */

@@ -86,7 +38,7 @@ public class Lucene40StoredFieldsFormat extends StoredFieldsFormat {
}
@Override
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si,
public final StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si,
FieldInfos fn, IOContext context) throws IOException {
return new Lucene40StoredFieldsReader(directory, si, fn, context);
}
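Note: the numeric-type bits (mask 0x7<<3) in the Lucene40StoredFieldsFormat javadoc above select how a numeric Value is read back. A minimal sketch, assuming bits holds the Bits byte and in is a java.io.DataInput (names and structure are illustrative, not the actual reader's):

    Number value = null;
    switch ((bits & (0x7 << 3)) >>> 3) {
      case 1: value = in.readInt(); break;                            // Int
      case 2: value = in.readLong(); break;                           // Long
      case 3: value = Float.intBitsToFloat(in.readInt()); break;      // Float stored as Int bits
      case 4: value = Double.longBitsToDouble(in.readLong()); break;  // Double stored as Long bits
      default: break;                                                 // not a numeric field
    }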
@@ -40,14 +40,11 @@ import java.nio.charset.StandardCharsets;
import java.util.Collections;
/**
* Class responsible for access to stored document fields.
* <p/>
* It uses <segment>.fdt and <segment>.fdx; files.
*
* @see Lucene40StoredFieldsFormat
* @lucene.internal
* Reader for 4.0 stored fields
* @deprecated only for reading 4.0 segments
*/
public final class Lucene40StoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable {
@Deprecated
final class Lucene40StoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable {
// NOTE: bit 0 is free here! You can steal it!
static final int FIELD_IS_BINARY = 1 << 1;

@@ -76,10 +73,10 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme
/** Extension of stored fields file */
public static final String FIELDS_EXTENSION = "fdt";
static final String FIELDS_EXTENSION = "fdt";
/** Extension of stored fields index file */
public static final String FIELDS_INDEX_EXTENSION = "fdx";
static final String FIELDS_INDEX_EXTENSION = "fdx";
private static final long RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Lucene40StoredFieldsReader.class);
@ -19,100 +19,19 @@ package org.apache.lucene.codecs.lucene40;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsReader;
|
||||
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
||||
/**
|
||||
* Lucene 4.0 Term Vectors format.
|
||||
* <p>Term Vector support is an optional on a field by field basis. It consists of
|
||||
* 3 files.</p>
|
||||
* <ol>
|
||||
* <li><a name="tvx" id="tvx"></a>
|
||||
* <p>The Document Index or .tvx file.</p>
|
||||
* <p>For each document, this stores the offset into the document data (.tvd) and
|
||||
* field data (.tvf) files.</p>
|
||||
* <p>DocumentIndex (.tvx) --> Header,<DocumentPosition,FieldPosition>
|
||||
* <sup>NumDocs</sup></p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>DocumentPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
|
||||
* <li>FieldPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="tvd" id="tvd"></a>
|
||||
* <p>The Document or .tvd file.</p>
|
||||
* <p>This contains, for each document, the number of fields, a list of the fields
|
||||
* with term vector info and finally a list of pointers to the field information
|
||||
* in the .tvf (Term Vector Fields) file.</p>
|
||||
* <p>The .tvd file is used to map out the fields that have term vectors stored
|
||||
* and where the field information is in the .tvf file.</p>
|
||||
* <p>Document (.tvd) --> Header,<NumFields, FieldNums,
|
||||
* FieldPositions> <sup>NumDocs</sup></p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>NumFields --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldNums --> <FieldNumDelta> <sup>NumFields</sup></li>
|
||||
* <li>FieldNumDelta --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldPositions --> <FieldPositionDelta> <sup>NumFields-1</sup></li>
|
||||
* <li>FieldPositionDelta --> {@link DataOutput#writeVLong VLong}</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="tvf" id="tvf"></a>
|
||||
* <p>The Field or .tvf file.</p>
|
||||
* <p>This file contains, for each field that has a term vector stored, a list of
|
||||
* the terms, their frequencies and, optionally, position, offset, and payload
|
||||
* information.</p>
|
||||
* <p>Field (.tvf) --> Header,<NumTerms, Flags, TermFreqs>
|
||||
* <sup>NumFields</sup></p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>NumTerms --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Flags --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>TermFreqs --> <TermText, TermFreq, Positions?, PayloadData?, Offsets?>
|
||||
* <sup>NumTerms</sup></li>
|
||||
* <li>TermText --> <PrefixLength, Suffix></li>
|
||||
* <li>PrefixLength --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Suffix --> {@link DataOutput#writeString String}</li>
|
||||
* <li>TermFreq --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>Positions --> <PositionDelta PayloadLength?><sup>TermFreq</sup></li>
|
||||
* <li>PositionDelta --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>PayloadLength --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>PayloadData --> {@link DataOutput#writeByte Byte}<sup>NumPayloadBytes</sup></li>
|
||||
* <li>Offsets --> <{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}><sup>TermFreq</sup></li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <ul>
|
||||
 * <li>Flags byte stores whether this term vector has position, offset, and payload
 * information stored.</li>
|
||||
* <li>Term byte prefixes are shared. The PrefixLength is the number of initial
|
||||
* bytes from the previous term which must be pre-pended to a term's suffix
|
||||
* in order to form the term's bytes. Thus, if the previous term's text was "bone"
|
||||
* and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
|
||||
* <li>PositionDelta is, if payloads are disabled for the term's field, the
|
||||
* difference between the position of the current occurrence in the document and
|
||||
* the previous occurrence (or zero, if this is the first occurrence in this
|
||||
* document). If payloads are enabled for the term's field, then PositionDelta/2
|
||||
* is the difference between the current and the previous position. If payloads
|
||||
* are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
|
||||
* the length of the payload at the current term position.</li>
|
||||
* <li>PayloadData is metadata associated with a term position. If
|
||||
* PayloadLength is stored at the current position, then it indicates the length
|
||||
* of this payload. If PayloadLength is not stored, then this payload has the same
|
||||
* length as the payload at the previous position. PayloadData encodes the
|
||||
 * concatenated bytes for all of a term's occurrences.</li>
|
||||
* <li>Offsets are stored as delta encoded VInts. The first VInt is the
|
||||
* startOffset, the second is the endOffset.</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* </ol>
|
||||
* @deprecated only for reading 4.0 and 4.1 segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene40TermVectorsFormat extends TermVectorsFormat {
|
||||
|
||||
/** Sole constructor. */
|
||||
|
@ -120,7 +39,7 @@ public class Lucene40TermVectorsFormat extends TermVectorsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
|
||||
public final TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
|
||||
return new Lucene40TermVectorsReader(directory, segmentInfo, fieldInfos, context);
|
||||
}
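The prefix-shared TermText and the payload-aware PositionDelta rules in the format notes above can be illustrated with a small plain-Java sketch. This is not Lucene code; the hard-coded prefix length, suffix and deltas are made-up example values that just follow the rules as documented.

public class TermVectorDecodeSketch {
  public static void main(String[] args) {
    // TermText: PrefixLength bytes are taken from the previous term, then the suffix is appended.
    String previousTerm = "bone";
    int prefixLength = 2;                 // as in the example above: "bone" -> "boy"
    String suffix = "y";
    String term = previousTerm.substring(0, prefixLength) + suffix;

    // Positions 5 and 9 of a term whose field has payloads enabled:
    // the stored delta is (positionDelta << 1), with the low bit set when a new PayloadLength follows.
    int[] storedDeltas = { (5 << 1) | 1, (4 << 1) };
    int position = 0;
    for (int delta : storedDeltas) {
      position += delta >>> 1;            // with payloads enabled, PositionDelta/2 is the real difference
      boolean payloadLengthFollows = (delta & 1) != 0;
      System.out.println(term + " at position " + position
          + (payloadLengthFollows ? " (PayloadLength stored)" : " (payload length unchanged)"));
    }
  }
}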
@ -48,12 +48,10 @@ import org.apache.lucene.util.IOUtils;
|
|||
|
||||
/**
|
||||
* Lucene 4.0 Term Vectors reader.
|
||||
* <p>
|
||||
* It reads .tvd, .tvf, and .tvx files.
|
||||
*
|
||||
* @see Lucene40TermVectorsFormat
|
||||
* @deprecated only for reading 4.0 and 4.1 segments
|
||||
*/
|
||||
public class Lucene40TermVectorsReader extends TermVectorsReader implements Closeable {
|
||||
@Deprecated
|
||||
final class Lucene40TermVectorsReader extends TermVectorsReader implements Closeable {
|
||||
|
||||
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene41;
|
|||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
|
@ -36,14 +35,8 @@ import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
|
|||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.1 index format, with configurable per-field postings formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene41 package documentation for file format details.
|
||||
* @deprecated Only for reading old 4.0 segments
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.1 index format
|
||||
* @deprecated Only for reading old 4.1 segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene41Codec extends Codec {
|
||||
|
@ -65,7 +58,6 @@ public class Lucene41Codec extends Codec {
|
|||
super("Lucene41");
|
||||
}
|
||||
|
||||
// TODO: slightly evil
|
||||
@Override
|
||||
public StoredFieldsFormat storedFieldsFormat() {
|
||||
return fieldsFormat;
@ -30,6 +30,7 @@ import org.apache.lucene.store.IOContext;
|
|||
|
||||
/**
|
||||
* Lucene 4.1 stored fields format.
|
||||
* @deprecated only for reading old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene41StoredFieldsFormat extends StoredFieldsFormat {
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Random-access reader for {@code Lucene41CompressingStoredFieldsIndexWriter}.
|
||||
* Reader for 4.x stored fields/term vectors index
|
||||
* @deprecated only for reading old segments
|
||||
*/
|
||||
@Deprecated
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
|
@ -41,18 +40,10 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.10 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene410 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.10 codec
|
||||
* @deprecated only for reading old 4.10 segments
|
||||
*/
|
||||
// NOTE: if we make largish changes in a minor release, easier to just make Lucene411Codec or whatever
|
||||
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
|
||||
// (it writes a minor version, etc).
|
||||
@Deprecated
|
||||
public class Lucene410Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
|
@ -38,19 +37,9 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.2 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene42 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.2 index format
|
||||
* @deprecated Only for reading old 4.2 segments
|
||||
*/
|
||||
// NOTE: if we make largish changes in a minor release, easier to just make Lucene43Codec or whatever
|
||||
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
|
||||
// (it writes a minor version, etc).
|
||||
@Deprecated
|
||||
public class Lucene42Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
@ -19,119 +19,22 @@ package org.apache.lucene.codecs.lucene42;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
|
||||
/**
|
||||
* Lucene 4.2 DocValues format.
|
||||
* <p>
|
||||
 * Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with eight basic strategies.
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Delta-compressed Numerics: per-document integers written in blocks of 4096. For each block
|
||||
* the minimum value is encoded, and each entry is a delta from that minimum value.
|
||||
* <li>Table-compressed Numerics: when the number of unique values is very small, a lookup table
|
||||
* is written instead. Each per-document entry is instead the ordinal to this table.
|
||||
* <li>Uncompressed Numerics: when all values would fit into a single byte, and the
|
||||
* <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they
|
||||
* are written as absolute values (with no indirection or packing) for performance.
|
||||
* <li>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest
|
||||
 * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||
 * Each document's value can be addressed by docID * length.
|
||||
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||
* for each document. The addresses are written in blocks of 4096, with the current absolute
|
||||
* start for the block, and the average (expected) delta per entry. For each document the
|
||||
* deviation from the delta (actual - expected) is written.
|
||||
* <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
|
||||
* ordinals written using one of the numeric strategies above.
|
||||
* <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
|
||||
* ordinal list written using one of the binary strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
* <ol>
|
||||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm" id="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry</li>
|
||||
* <li>NumericEntry --> DataOffset,CompressionType,PackedVersion</li>
|
||||
* <li>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
|
||||
* <li>SortedEntry --> DataOffset,ValueCount</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>DataOffset,DataLength --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>SortedSet fields have two entries: a SortedEntry with the FST metadata,
|
||||
* and an ordinary BinaryEntry for the document-to-ord-list metadata.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
 * <p>EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>CompressionType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 4096 integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
* <li>1 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* <li>2 --> uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number
|
||||
* of bits required to 8, and all values fit in a byte, these are written as absolute binary values
|
||||
* for performance.
|
||||
 * <li>3 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
|
||||
* using blocks of delta-encoded ints.
|
||||
* </ul>
|
||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
|
||||
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
|
||||
* is written for the addresses.
|
||||
* <li><a name="dvd" id="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics</li>
|
||||
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
|
||||
* <li>SortedData --> {@link FST FST<Int64>}</li>
|
||||
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li>
|
||||
* <li>TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
|
||||
* <li>UncompressedNumerics --> {@link DataOutput#writeByte Byte}<sup>maxdoc</sup></li>
|
||||
* <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>SortedSet entries store the list of ordinals in their BinaryData as a
|
||||
 * sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
|
||||
* </ol>
|
||||
* <p>
|
||||
* Limitations:
|
||||
* <ul>
|
||||
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
|
||||
* </ul>
|
||||
* @deprecated Only for reading old 4.2 segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene42DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Maximum length for each binary doc values field. */
|
||||
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
|
||||
static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
|
||||
|
||||
final float acceptableOverheadRatio;
|
||||
|
||||
|
@ -162,7 +65,7 @@ public class Lucene42DocValuesFormat extends DocValuesFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene42DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||
}
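The numeric strategies listed in the javadoc above (delta-, table-, GCD- and uncompressed) key on a few cheap statistics over a block of values. The sketch below is a hypothetical helper, not Lucene's actual Lucene42DocValuesConsumer logic (whose heuristics and thresholds differ); it only shows what each strategy looks at.

import java.math.BigInteger;
import java.util.Arrays;

public class NumericStrategySketch {
  enum Strategy { DELTA, GCD, TABLE, UNCOMPRESSED }

  static Strategy choose(long[] block) {
    long min = Arrays.stream(block).min().orElse(0);
    long max = Arrays.stream(block).max().orElse(0);
    long uniqueCount = Arrays.stream(block).distinct().count();
    long gcd = 0;
    for (long v : block) {
      gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValue();
    }
    if (uniqueCount <= 256) {
      return Strategy.TABLE;          // small lookup table of unique values, per-document ordinals
    } else if (gcd > 1) {
      return Strategy.GCD;            // store (value - min) / gcd, then delta-compress the quotients
    } else if (min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) {
      return Strategy.UNCOMPRESSED;   // every value already fits in a single byte
    }
    return Strategy.DELTA;            // deltas from the block minimum, bit-packed
  }

  public static void main(String[] args) {
    System.out.println(choose(new long[] { 1, 5, 1, 9, 5 }));   // TABLE: only a few distinct values
    long[] dates = new long[1000];
    for (int i = 0; i < dates.length; i++) {
      dates[i] = 1388534400000L + i * 86400000L;                // daily timestamps share a one-day divisor
    }
    System.out.println(choose(dates));                          // GCD
  }
}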
@ -66,9 +66,11 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Reader for {@link Lucene42DocValuesFormat}
|
||||
* Reader for 4.2 docvalues
|
||||
* @deprecated only for reading old 4.x segments
|
||||
*/
|
||||
class Lucene42DocValuesProducer extends DocValuesProducer {
|
||||
@Deprecated
|
||||
final class Lucene42DocValuesProducer extends DocValuesProducer {
|
||||
// metadata maps (just file pointers and minimal stuff)
|
||||
private final Map<String,NumericEntry> numerics;
|
||||
private final Map<String,BinaryEntry> binaries;
@ -19,70 +19,12 @@ package org.apache.lucene.codecs.lucene42;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosReader;
|
||||
import org.apache.lucene.codecs.FieldInfosWriter;
|
||||
import org.apache.lucene.index.FieldInfo.DocValuesType; // javadoc
|
||||
import org.apache.lucene.store.DataOutput; // javadoc
|
||||
|
||||
/**
|
||||
* Lucene 4.2 Field Infos format.
|
||||
* <p>
|
||||
* <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.</p>
|
||||
* <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
|
||||
* FieldBits,DocValuesBits,Attributes> <sup>FieldsCount</sup></p>
|
||||
* <p>Data types:
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#checkHeader CodecHeader}</li>
|
||||
* <li>FieldsCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldName --> {@link DataOutput#writeString String}</li>
|
||||
* <li>FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>FieldNumber --> {@link DataOutput#writeInt VInt}</li>
|
||||
* <li>Attributes --> {@link DataOutput#writeStringStringMap Map<String,String>}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <ul>
|
||||
* <li>FieldsCount: the number of fields in this file.</li>
|
||||
* <li>FieldName: name of the field as a UTF-8 String.</li>
|
||||
* <li>FieldNumber: the field's number. Note that unlike previous versions of
|
||||
* Lucene, the fields are not numbered implicitly by their order in the
|
||||
 * file, but explicitly.</li>
|
||||
* <li>FieldBits: a byte containing field options.
|
||||
* <ul>
|
||||
* <li>The low-order bit is one for indexed fields, and zero for non-indexed
|
||||
* fields.</li>
|
||||
* <li>The second lowest-order bit is one for fields that have term vectors
|
||||
* stored, and zero for fields without term vectors.</li>
|
||||
* <li>If the third lowest order-bit is set (0x4), offsets are stored into
|
||||
* the postings list in addition to positions.</li>
|
||||
* <li>Fourth bit is unused.</li>
|
||||
* <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the
|
||||
* indexed field.</li>
|
||||
* <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the
|
||||
* indexed field.</li>
|
||||
* <li>If the seventh lowest-order bit is set (0x40), term frequencies and
|
||||
* positions omitted for the indexed field.</li>
|
||||
* <li>If the eighth lowest-order bit is set (0x80), positions are omitted for the
|
||||
* indexed field.</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li>DocValuesBits: a byte containing per-document value types. The type
|
||||
 * is recorded as two four-bit integers, with the high-order bits representing
|
||||
* <code>norms</code> options, and the low-order bits representing
|
||||
* {@code DocValues} options. Each four-bit integer can be decoded as such:
|
||||
* <ul>
|
||||
* <li>0: no DocValues for this field.</li>
|
||||
* <li>1: NumericDocValues. ({@link DocValuesType#NUMERIC})</li>
|
||||
* <li>2: BinaryDocValues. ({@code DocValuesType#BINARY})</li>
|
||||
* <li>3: SortedDocValues. ({@code DocValuesType#SORTED})</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li>Attributes: a key-value map of codec-private attributes.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @deprecated Only for reading old 4.2-4.5 segments
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -94,7 +36,7 @@ public class Lucene42FieldInfosFormat extends FieldInfosFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldInfosReader getFieldInfosReader() throws IOException {
|
||||
public final FieldInfosReader getFieldInfosReader() throws IOException {
|
||||
return reader;
|
||||
}
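The FieldBits and DocValuesBits bytes described above are plain bit flags plus a pair of nibbles. The constants below merely mirror the documented bit positions; they are not the actual reader's constants, and the sample bytes are invented.

public class FieldInfosBitsSketch {
  static final byte IS_INDEXED       = 0x01;
  static final byte STORE_TERMVECTOR = 0x02;
  static final byte STORE_OFFSETS    = 0x04;
  static final byte OMIT_NORMS       = 0x10;
  static final byte STORE_PAYLOADS   = 0x20;
  static final byte OMIT_TF_AND_POSITIONS = 0x40;
  static final byte OMIT_POSITIONS   = (byte) 0x80;

  public static void main(String[] args) {
    byte fieldBits = IS_INDEXED | STORE_TERMVECTOR | STORE_PAYLOADS;
    System.out.println("indexed="      + ((fieldBits & IS_INDEXED) != 0)
                     + " termVectors=" + ((fieldBits & STORE_TERMVECTOR) != 0)
                     + " payloads="    + ((fieldBits & STORE_PAYLOADS) != 0));

    // DocValuesBits: high nibble = norms type, low nibble = docvalues type (0=none, 1=numeric, 2=binary, 3=sorted)
    byte docValuesBits = (byte) ((1 << 4) | 3);
    int docValuesType = docValuesBits & 0x0F;          // 3 -> SortedDocValues
    int normsType     = (docValuesBits >>> 4) & 0x0F;  // 1 -> numeric norms
    System.out.println("docValuesType=" + docValuesType + " normsType=" + normsType);
  }
}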
@ -38,9 +38,7 @@ import org.apache.lucene.util.IOUtils;
|
|||
/**
|
||||
* Lucene 4.2 FieldInfos reader.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @deprecated Only for reading old 4.2-4.5 segments
|
||||
* @see Lucene42FieldInfosFormat
|
||||
*/
|
||||
@Deprecated
|
||||
final class Lucene42FieldInfosReader extends FieldInfosReader {
@ -19,8 +19,6 @@ package org.apache.lucene.codecs.lucene42;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.NormsProducer;
|
||||
|
@ -30,19 +28,9 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
|
||||
/**
|
||||
* Lucene 4.2 score normalization format.
|
||||
* <p>
|
||||
* NOTE: this uses the same format as {@link Lucene42DocValuesFormat}
|
||||
* Numeric DocValues, but with different file extensions, and passing
|
||||
* {@link PackedInts#FASTEST} for uncompressed encoding: trading off
|
||||
* space for performance.
|
||||
* <p>
|
||||
* Files:
|
||||
* <ul>
|
||||
* <li><tt>.nvd</tt>: DocValues data</li>
|
||||
* <li><tt>.nvm</tt>: DocValues metadata</li>
|
||||
* </ul>
|
||||
* @see Lucene42DocValuesFormat
|
||||
* @deprecated only for reading old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene42NormsFormat extends NormsFormat {
|
||||
final float acceptableOverheadRatio;
|
||||
|
||||
|
@ -73,7 +61,7 @@ public class Lucene42NormsFormat extends NormsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public NormsProducer normsProducer(SegmentReadState state) throws IOException {
|
||||
public final NormsProducer normsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene42NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||
}
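The acceptableOverheadRatio / PackedInts#FASTEST trade-off mentioned above amounts to letting a writer pad the exact bits-per-value up to a byte-aligned width when the caller tolerates the extra space. The following is only a rough sketch of that idea; Lucene's real PackedInts format selection is considerably more involved.

public class OverheadRatioSketch {
  // exactBits: minimum bits per value; acceptableOverheadRatio: extra space tolerated,
  // as a fraction of exactBits (0 = compact, large values = favor decode speed).
  static int chooseBitsPerValue(int exactBits, float acceptableOverheadRatio) {
    float maxBits = exactBits * (1 + acceptableOverheadRatio);
    for (int padded : new int[] { 8, 16, 32, 64 }) {
      if (padded >= exactBits && padded <= maxBits) {
        return padded;   // byte-aligned widths decode with plain loads instead of bit twiddling
      }
    }
    return exactBits;    // stay compact and accept slower decoding
  }

  public static void main(String[] args) {
    System.out.println(chooseBitsPerValue(4, 0.0f)); // 4: no padding allowed
    System.out.println(chooseBitsPerValue(4, 7.0f)); // 8: generous overhead buys byte alignment
  }
}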
@ -18,7 +18,6 @@ package org.apache.lucene.codecs.lucene42;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.codecs.NormsProducer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
@ -28,11 +27,10 @@ import org.apache.lucene.util.Accountable;
|
|||
|
||||
/**
|
||||
* Reads 4.2-4.8 norms.
|
||||
* Implemented the same as docvalues, but with a different filename.
|
||||
* @deprecated Only for reading old segments
|
||||
*/
|
||||
@Deprecated
|
||||
class Lucene42NormsProducer extends NormsProducer {
|
||||
final class Lucene42NormsProducer extends NormsProducer {
|
||||
private final Lucene42DocValuesProducer impl;
|
||||
|
||||
Lucene42NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
@ -55,7 +55,7 @@ import org.apache.lucene.util.packed.BlockPackedReaderIterator;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* {@link TermVectorsReader} for {@code Lucene42TermVectorsFormat}.
|
||||
* 4.2 term vectors reader
|
||||
* @deprecated only for reading old segments
|
||||
*/
|
||||
@Deprecated
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
|
@ -41,19 +40,9 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.5 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene45 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.5 index format
|
||||
* @deprecated Only for reading old 4.3-4.5 segments
|
||||
*/
|
||||
// NOTE: if we make largish changes in a minor release, easier to just make Lucene46Codec or whatever
|
||||
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
|
||||
// (it writes a minor version, etc).
|
||||
@Deprecated
|
||||
public class Lucene45Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
@ -39,7 +39,11 @@ import org.apache.lucene.util.packed.BlockPackedWriter;
|
|||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** writer for {@link Lucene45DocValuesFormat} */
|
||||
/**
|
||||
* writer for 4.5 docvalues format
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
static final int BLOCK_SIZE = 16384;
@ -19,148 +19,15 @@ package org.apache.lucene.codecs.lucene45;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.FieldInfo.DocValuesType;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Lucene 4.5 DocValues format.
|
||||
* <p>
|
||||
* Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with these strategies:
|
||||
* <p>
|
||||
* {@link DocValuesType#NUMERIC NUMERIC}:
|
||||
* <ul>
|
||||
* <li>Delta-compressed: per-document integers written in blocks of 16k. For each block
|
||||
* the minimum value in that block is encoded, and each entry is a delta from that
|
||||
* minimum value. Each block of deltas is compressed with bitpacking. For more
|
||||
* information, see {@link BlockPackedWriter}.
|
||||
* <li>Table-compressed: when the number of unique values is very small (< 256), and
|
||||
* when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
|
||||
* a lookup table is written instead. Each per-document entry is instead the ordinal
|
||||
* to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
|
||||
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
||||
 * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#BINARY BINARY}:
|
||||
* <ul>
|
||||
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||
* Each document's value can be addressed directly with multiplication ({@code docID * length}).
|
||||
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||
* for each document. The addresses are written in blocks of 16k, with the current absolute
|
||||
* start for the block, and the average (expected) delta per entry. For each document the
|
||||
* deviation from the delta (actual - expected) is written.
|
||||
* <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
|
||||
 * completely and other values sharing prefixes. Chunk addresses are written in blocks of 16k,
|
||||
* with the current absolute start for the block, and the average (expected) delta per entry.
|
||||
* For each chunk the deviation from the delta (actual - expected) is written.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED SORTED}:
|
||||
* <ul>
|
||||
* <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* along with the per-document ordinals written using one of the numeric strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_SET SORTED_SET}:
|
||||
* <ul>
|
||||
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* an ordinal list and per-document index into this list are written using the numeric strategies
|
||||
* above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
* <ol>
|
||||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm" id="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry</li>
|
||||
* <li>NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
|
||||
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD</li>
|
||||
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
|
||||
* <li>DeltaNumericEntry --> NumericHeader</li>
|
||||
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
|
||||
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
||||
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
||||
* <li>VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
|
||||
* <li>SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
|
||||
* <li>SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
|
||||
* and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>NumericType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
 * <li>1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
|
||||
* using blocks of delta-encoded ints.
|
||||
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* </ul>
|
||||
* <p>BinaryType indicates how Binary values will be stored:
|
||||
* <ul>
|
||||
* <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
|
||||
 * <li>1 --> variable-width. An address for each value is stored.
|
||||
* <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
|
||||
* </ul>
|
||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
|
||||
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
|
||||
* is written for the addresses.
|
||||
* <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
|
||||
 * If it is -1, then there are no missing values.
|
||||
* <p>Checksum contains the CRC32 checksum of all bytes in the .dvm file up
|
||||
* until the checksum. This is used to verify integrity of the file on opening the
|
||||
* index.
|
||||
* <li><a name="dvd" id="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
|
||||
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
|
||||
* <li>SortedData --> {@link FST FST<Int64>}</li>
|
||||
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>TableCompressedNumerics --> {@link PackedInts PackedInts}</li>
|
||||
* <li>GCDCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>SortedSet entries store the list of ordinals in their BinaryData as a
|
||||
 * sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
|
||||
* </ol>
|
||||
* @deprecated Only for reading old 4.3-4.5 segments
|
||||
* @lucene.experimental
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene45DocValuesFormat extends DocValuesFormat {
|
||||
|
@ -177,7 +44,7 @@ public class Lucene45DocValuesFormat extends DocValuesFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
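Prefix-compressed Binary, as described above, writes every 16th value in full and only a (shared-prefix length, suffix) pair for the rest. The sketch below shows that chunking on a handful of sorted strings; it is illustrative plain Java, not the actual Lucene45DocValuesConsumer code.

import java.nio.charset.StandardCharsets;

public class PrefixCompressSketch {
  static final int INTERVAL = 16;

  public static void main(String[] args) {
    String[] sortedTerms = { "apple", "applet", "application", "apply" };
    byte[] previous = new byte[0];
    for (int i = 0; i < sortedTerms.length; i++) {
      byte[] current = sortedTerms[i].getBytes(StandardCharsets.UTF_8);
      if (i % INTERVAL == 0) {
        // the first value of each chunk is written completely and its address is recorded
        System.out.println("full: " + sortedTerms[i]);
      } else {
        int shared = 0;
        while (shared < previous.length && shared < current.length && previous[shared] == current[shared]) {
          shared++;
        }
        // only the shared-prefix length and the remaining suffix bytes are written
        System.out.println("prefix=" + shared + " suffix="
            + new String(current, shared, current.length - shared, StandardCharsets.UTF_8));
      }
      previous = current;
    }
  }
}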
@ -63,13 +63,16 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.packed.BlockPackedReader;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** reader for {@link Lucene45DocValuesFormat} */
|
||||
/**
|
||||
* reader for 4.5 docvalues format
|
||||
* @deprecated only for reading old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
|
||||
private final Map<Integer,NumericEntry> numerics;
|
||||
private final Map<Integer,BinaryEntry> binaries;
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
|
@ -39,19 +38,9 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.6 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene46 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.6 index format
|
||||
* @deprecated Only for reading old 4.6-4.8 segments
|
||||
*/
|
||||
// NOTE: if we make largish changes in a minor release, easier to just make Lucene46Codec or whatever
|
||||
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
|
||||
// (it writes a minor version, etc).
|
||||
@Deprecated
|
||||
public class Lucene46Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
@ -19,77 +19,15 @@ package org.apache.lucene.codecs.lucene46;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosReader;
|
||||
import org.apache.lucene.codecs.FieldInfosWriter;
|
||||
import org.apache.lucene.index.FieldInfo.DocValuesType;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Lucene 4.6 Field Infos format.
|
||||
* <p>
|
||||
* <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.</p>
|
||||
* <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
|
||||
* FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer</p>
|
||||
* <p>Data types:
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#checkHeader CodecHeader}</li>
|
||||
* <li>FieldsCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>FieldName --> {@link DataOutput#writeString String}</li>
|
||||
* <li>FieldBits, DocValuesBits --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>FieldNumber --> {@link DataOutput#writeInt VInt}</li>
|
||||
* <li>Attributes --> {@link DataOutput#writeStringStringMap Map<String,String>}</li>
|
||||
* <li>DocValuesGen --> {@link DataOutput#writeLong(long) Int64}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <ul>
|
||||
* <li>FieldsCount: the number of fields in this file.</li>
|
||||
* <li>FieldName: name of the field as a UTF-8 String.</li>
|
||||
* <li>FieldNumber: the field's number. Note that unlike previous versions of
|
||||
* Lucene, the fields are not numbered implicitly by their order in the
|
||||
 * file, but explicitly.</li>
|
||||
* <li>FieldBits: a byte containing field options.
|
||||
* <ul>
|
||||
* <li>The low-order bit is one for indexed fields, and zero for non-indexed
|
||||
* fields.</li>
|
||||
* <li>The second lowest-order bit is one for fields that have term vectors
|
||||
* stored, and zero for fields without term vectors.</li>
|
||||
* <li>If the third lowest order-bit is set (0x4), offsets are stored into
|
||||
* the postings list in addition to positions.</li>
|
||||
* <li>Fourth bit is unused.</li>
|
||||
* <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the
|
||||
* indexed field.</li>
|
||||
* <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the
|
||||
* indexed field.</li>
|
||||
* <li>If the seventh lowest-order bit is set (0x40), term frequencies and
|
||||
* positions omitted for the indexed field.</li>
|
||||
* <li>If the eighth lowest-order bit is set (0x80), positions are omitted for the
|
||||
* indexed field.</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li>DocValuesBits: a byte containing per-document value types. The type
|
||||
 * is recorded as two four-bit integers, with the high-order bits representing
|
||||
* <code>norms</code> options, and the low-order bits representing
|
||||
* {@code DocValues} options. Each four-bit integer can be decoded as such:
|
||||
* <ul>
|
||||
* <li>0: no DocValues for this field.</li>
|
||||
* <li>1: NumericDocValues. ({@link DocValuesType#NUMERIC})</li>
|
||||
* <li>2: BinaryDocValues. ({@code DocValuesType#BINARY})</li>
|
||||
* <li>3: SortedDocValues. ({@code DocValuesType#SORTED})</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li>DocValuesGen is the generation count of the field's DocValues. If this is -1,
|
||||
* there are no DocValues updates to that field. Anything above zero means there
|
||||
* are updates stored by {@link DocValuesFormat}.</li>
|
||||
* <li>Attributes: a key-value map of codec-private attributes.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
public final class Lucene46FieldInfosFormat extends FieldInfosFormat {
|
||||
private final FieldInfosReader reader = new Lucene46FieldInfosReader();
|
||||
private final FieldInfosWriter writer = new Lucene46FieldInfosWriter();
|
||||
|
@ -99,7 +37,7 @@ public final class Lucene46FieldInfosFormat extends FieldInfosFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldInfosReader getFieldInfosReader() throws IOException {
|
||||
public final FieldInfosReader getFieldInfosReader() throws IOException {
|
||||
return reader;
|
||||
}
@ -38,9 +38,9 @@ import org.apache.lucene.store.IndexInput;
|
|||
/**
|
||||
* Lucene 4.6 FieldInfos reader.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @see Lucene46FieldInfosFormat
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
final class Lucene46FieldInfosReader extends FieldInfosReader {
|
||||
|
||||
/** Sole constructor. */
@ -34,9 +34,9 @@ import org.apache.lucene.store.IOContext;
|
|||
/**
|
||||
* Lucene 4.6 FieldInfos writer.
|
||||
*
|
||||
* @see Lucene46FieldInfosFormat
|
||||
* @lucene.experimental
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
final class Lucene46FieldInfosWriter extends FieldInfosWriter {
|
||||
|
||||
/** Sole constructor. */
@ -17,54 +17,16 @@ package org.apache.lucene.codecs.lucene46;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoReader;
|
||||
import org.apache.lucene.codecs.SegmentInfoWriter;
|
||||
import org.apache.lucene.index.IndexWriter; // javadocs
|
||||
import org.apache.lucene.index.SegmentInfo; // javadocs
|
||||
import org.apache.lucene.index.SegmentInfos; // javadocs
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
|
||||
/**
|
||||
* Lucene 4.6 Segment info format.
|
||||
* <p>
|
||||
* Files:
|
||||
* <ul>
|
||||
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer
|
||||
* </ul>
|
||||
* </p>
|
||||
* Data types:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>SegSize --> {@link DataOutput#writeInt Int32}</li>
|
||||
* <li>SegVersion --> {@link DataOutput#writeString String}</li>
|
||||
* <li>Files --> {@link DataOutput#writeStringSet Set<String>}</li>
|
||||
* <li>Diagnostics --> {@link DataOutput#writeStringStringMap Map<String,String>}</li>
|
||||
* <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>SegVersion is the code version that created the segment.</li>
|
||||
* <li>SegSize is the number of documents contained in the segment index.</li>
|
||||
* <li>IsCompoundFile records whether the segment is written as a compound file or
|
||||
* not. If this is -1, the segment is not a compound file. If it is 1, the segment
|
||||
* is a compound file.</li>
|
||||
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
|
||||
* for each segment it creates. It includes metadata like the current Lucene
|
||||
* version, OS, Java version, why the segment was created (merge, flush,
|
||||
* addIndexes), etc.</li>
|
||||
* <li>Files is a list of files referred to by this segment.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @see SegmentInfos
|
||||
* @lucene.experimental
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
|
||||
private final SegmentInfoReader reader = new Lucene46SegmentInfoReader();
|
||||
|
||||
|
@ -73,7 +35,7 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public SegmentInfoReader getSegmentInfoReader() {
|
||||
public final SegmentInfoReader getSegmentInfoReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
|
@ -83,7 +45,7 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
|
|||
}
|
||||
|
||||
/** File extension used to store {@link SegmentInfo}. */
|
||||
public final static String SI_EXTENSION = "si";
|
||||
final static String SI_EXTENSION = "si";
|
||||
static final String CODEC_NAME = "Lucene46SegmentInfo";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CHECKSUM = 1;
@ -33,12 +33,11 @@ import org.apache.lucene.store.IOContext;
|
|||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Lucene 4.6 implementation of {@link SegmentInfoReader}.
|
||||
*
|
||||
* @see Lucene46SegmentInfoFormat
|
||||
* @lucene.experimental
|
||||
* Lucene 4.6 segment infos reader
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
public class Lucene46SegmentInfoReader extends SegmentInfoReader {
|
||||
@Deprecated
|
||||
final class Lucene46SegmentInfoReader extends SegmentInfoReader {
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene46SegmentInfoReader() {
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsConsumer;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
|
@ -40,18 +39,10 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 4.9 index format, with configurable per-field postings
|
||||
* and docvalues formats.
|
||||
* <p>
|
||||
* If you want to reuse functionality of this codec in another codec, extend
|
||||
* {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene49 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
* Implements the Lucene 4.9 index format
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
// NOTE: if we make largish changes in a minor release, easier to just make Lucene410Codec or whatever
|
||||
// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
|
||||
// (it writes a minor version, etc).
|
||||
@Deprecated
|
||||
public class Lucene49Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
@ -40,7 +40,11 @@ import org.apache.lucene.util.packed.DirectWriter;
|
|||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** writer for {@link Lucene49DocValuesFormat} */
|
||||
/**
|
||||
* writer for 4.9 docvalues format
|
||||
* @deprecated only for old 4.x segments
|
||||
*/
|
||||
@Deprecated
|
||||
class Lucene49DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
static final int BLOCK_SIZE = 16384;
@ -19,151 +19,17 @@ package org.apache.lucene.codecs.lucene49;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.FieldInfo.DocValuesType;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.packed.DirectWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
|
||||
/**
|
||||
* Lucene 4.9 DocValues format.
|
||||
* <p>
|
||||
* Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
|
||||
* <p>
|
||||
* {@link DocValuesType#NUMERIC NUMERIC}:
|
||||
* <ul>
|
||||
* <li>Delta-compressed: per-document integers written as deltas from the minimum value,
|
||||
* compressed with bitpacking. For more information, see {@link DirectWriter}.
|
||||
* <li>Table-compressed: when the number of unique values is very small (< 256), and
|
||||
* when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
|
||||
* a lookup table is written instead. Each per-document entry is instead the ordinal
|
||||
* to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
|
||||
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
||||
 * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||
* <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
|
||||
* as blocks of bitpacked integers, encoding the deviation from the expected delta.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#BINARY BINARY}:
|
||||
* <ul>
|
||||
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||
* Each document's value can be addressed directly with multiplication ({@code docID * length}).
|
||||
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||
* for each document. The addresses are written as Monotonic-compressed numerics.
|
||||
* <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
|
||||
 * completely and other values sharing prefixes. Chunk addresses are written as Monotonic-compressed
|
||||
* numerics.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED SORTED}:
|
||||
* <ul>
|
||||
* <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* along with the per-document ordinals written using one of the numeric strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_SET SORTED_SET}:
|
||||
* <ul>
|
||||
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* an ordinal list and per-document index into this list are written using the numeric strategies
|
||||
* above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
|
||||
* <ul>
|
||||
* <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
|
||||
* strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
* <ol>
|
||||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm" id="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry</li>
|
||||
* <li>NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
|
||||
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD,BitsPerValue</li>
|
||||
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,BitsPerValue</li>
|
||||
* <li>DeltaNumericEntry --> NumericHeader,MinValue,BitsPerValue</li>
|
||||
* <li>MonotonicNumericEntry --> NumericHeader,PackedVersion,BlockSize</li>
|
||||
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset</li>
|
||||
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
||||
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
||||
* <li>VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
|
||||
* <li>SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
|
||||
* <li>SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
|
||||
* <li>SortedNumericEntry --> EntryType,NumericEntry,NumericEntry</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>TableSize,BitsPerValue --> {@link DataOutput#writeVInt vInt}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
|
||||
* and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
|
||||
 * <p>SortedNumeric fields have two entries: a NumericEntry with the value metadata,
|
||||
* and a numeric entry with the document-to-value index.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>EndOffset is the pointer to the end of the data in the DocValues data (.dvd)</p>
|
||||
* <p>NumericType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
* <li>1 -->, gcd-compressed. When all integers share a common divisor, only quotients are stored
|
||||
* using blocks of delta-encoded ints.
|
||||
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* </ul>
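
A hedged sketch of how a writer might pick among these three numeric strategies; the 256-entry table threshold and the exact gcd handling are assumptions, not the codec's actual heuristics:

  // Decide between table-, gcd- and delta-compression for a block of values.
  static String chooseNumericStrategy(long[] values) {
    long gcd = 0;
    java.util.Set<Long> unique = new java.util.HashSet<>();
    for (long v : values) {
      gcd = java.math.BigInteger.valueOf(gcd).gcd(java.math.BigInteger.valueOf(v)).longValue();
      if (unique.size() <= 256) {
        unique.add(v);             // stop tracking once the set is clearly too large
      }
    }
    if (unique.size() <= 256) {
      return "table";              // few distinct values: write a lookup table plus per-doc ordinals
    } else if (gcd > 1) {
      return "gcd";                // common divisor: store quotients, delta-encoded in blocks
    } else {
      return "delta";              // default: delta-encode from the per-block minimum
    }
  }
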
 *   <p>BinaryType indicates how Binary values will be stored:
 *   <ul>
 *     <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
 *     <li>1 --> variable-width. An address for each value is stored.
 *     <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
 *   </ul>
 *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
 *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
 *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
 *      is written for the addresses.
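
The two addressing modes boil down to a little arithmetic versus a lookup; a minimal sketch (parameter names are illustrative, not the codec's actual fields):

  // Fixed-width: every value has the same length, so the start offset is pure arithmetic.
  static long fixedStartOffset(long dataOffset, int docID, int length) {
    return dataOffset + (long) docID * length;
  }

  // Variable-width: a monotonically increasing address per value is looked up instead;
  // addresses[docID + 1] - addresses[docID] gives the value's length.
  static long variableStartOffset(long[] addresses, int docID) {
    return addresses[docID];
  }
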
 *   <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
 *      If it is -1, then there are no missing values.
 *   <p>Checksum contains the CRC32 checksum of all bytes in the .dvm file up
 *      until the checksum. This is used to verify integrity of the file on opening the
 *      index.
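
Conceptually the check is just a CRC32 over every byte that precedes the stored checksum; a simplified sketch with java.util.zip (the real codec performs this through CodecUtil and a full footer layout, which this glosses over):

  // Verify that the stored checksum matches a CRC32 of all preceding bytes (simplified).
  static void verifyChecksum(byte[] file, int checksumStart, long storedChecksum) {
    java.util.zip.CRC32 crc = new java.util.zip.CRC32();
    crc.update(file, 0, checksumStart);          // every byte up to, not including, the checksum
    if (crc.getValue() != storedChecksum) {
      throw new RuntimeException("checksum mismatch: index file is corrupt");
    }
  }
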
 *   <li><a name="dvd" id="dvd"></a>
 *   <p>The DocValues data or .dvd file.</p>
 *   <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
 *   <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
 *     <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
 *     <li>SortedData --> {@link FST FST<Int64>}</li>
 *     <li>DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --> {@link DirectWriter PackedInts}</li>
 *     <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
 *     <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 *   </ul>
 * </ol>
 * @lucene.experimental
 * @deprecated only for old 4.x segments
 */
@Deprecated
public class Lucene49DocValuesFormat extends DocValuesFormat {

  /** Sole Constructor */

@@ -177,7 +43,7 @@ public class Lucene49DocValuesFormat extends DocValuesFormat {
  }

  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
  public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    return new Lucene49DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  }

@@ -67,7 +67,11 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.DirectReader;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

/** reader for {@link Lucene49DocValuesFormat} */
/**
 * reader for 4.9 docvalues format
 * @deprecated only for 4.x segments
 */
@Deprecated
class Lucene49DocValuesProducer extends DocValuesProducer implements Closeable {
  private final Map<String,NumericEntry> numerics;
  private final Map<String,BinaryEntry> binaries;

@@ -26,7 +26,6 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.MissingOrdRemapper;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosReader.LegacyDocValuesType;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;

@@ -37,7 +36,12 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

class Lucene40DocValuesWriter extends DocValuesConsumer {
/**
 * Writer for 4.0 docvalues format
 * @deprecated for test purposes only
 */
@Deprecated
final class Lucene40DocValuesWriter extends DocValuesConsumer {
  private final Directory dir;
  private final SegmentWriteState state;
  private final String legacyKey;

@@ -33,13 +33,11 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

/**
 * Lucene 4.0 FieldInfos writer.
 *
 * @see Lucene40FieldInfosFormat
 * @lucene.experimental
 * Writer for 4.0 fieldinfos format
 * @deprecated for test purposes only
 */
@Deprecated
public class Lucene40FieldInfosWriter extends FieldInfosWriter {
public final class Lucene40FieldInfosWriter extends FieldInfosWriter {

  /** Sole constructor. */
  public Lucene40FieldInfosWriter() {

@@ -37,12 +37,11 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Concrete class that writes the 4.0 frq/prx postings format.
 *
 * @see Lucene40PostingsFormat
 * @lucene.experimental
 * Writer for 4.0 postings format
 * @deprecated for test purposes only
 */
public final class Lucene40PostingsWriter extends PushPostingsWriterBase {
@Deprecated
final class Lucene40PostingsWriter extends PushPostingsWriterBase {

  final IndexOutput freqOut;
  final IndexOutput proxOut;

@@ -28,8 +28,11 @@ import org.apache.lucene.codecs.TermVectorsFormat;
 * limitations under the License.
 */

/** Read-write version of Lucene40Codec for testing */
@SuppressWarnings("deprecation")
/**
 * Read-write version of 4.0 codec for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene40RWCodec extends Lucene40Codec {

  private final FieldInfosFormat fieldInfos = new Lucene40FieldInfosFormat() {

@@ -23,9 +23,12 @@ import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;

/** Read-write version of {@link Lucene40DocValuesFormat} for testing */
@SuppressWarnings("deprecation")
public class Lucene40RWDocValuesFormat extends Lucene40DocValuesFormat {
/**
 * Read-write version of 4.0 docvalues format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene40RWDocValuesFormat extends Lucene40DocValuesFormat {

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

@@ -24,9 +24,12 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;

/** Read-write version of {@link Lucene40NormsFormat} for testing */
@SuppressWarnings("deprecation")
public class Lucene40RWNormsFormat extends Lucene40NormsFormat {
/**
 * Read-write version of 4.0 norms format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene40RWNormsFormat extends Lucene40NormsFormat {

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {

@@ -23,13 +23,18 @@ import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Read-write version of {@link Lucene40PostingsFormat} for testing.
 * Read-write version of 4.0 postings format for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene40RWPostingsFormat extends Lucene40PostingsFormat {
@Deprecated
public final class Lucene40RWPostingsFormat extends Lucene40PostingsFormat {

  /** minimum items (terms or sub-blocks) per block for 4.0 BlockTree */
  final static int MIN_BLOCK_SIZE = 25;
  /** maximum items (terms or sub-blocks) per block for 4.0 BlockTree */
  final static int MAX_BLOCK_SIZE = 48;

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

@@ -41,7 +46,7 @@ public class Lucene40RWPostingsFormat extends Lucene40PostingsFormat {
    // Or... you must make a new Codec for this?
    boolean success = false;
    try {
      FieldsConsumer ret = new BlockTreeTermsWriter(state, docs, minBlockSize, maxBlockSize);
      FieldsConsumer ret = new BlockTreeTermsWriter(state, docs, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
      success = true;
      return ret;
    } finally {

@@ -19,8 +19,12 @@ package org.apache.lucene.codecs.lucene40;

import org.apache.lucene.codecs.SegmentInfoWriter;

/** read-write version of 4.0 segmentinfos for testing */
public class Lucene40RWSegmentInfoFormat extends Lucene40SegmentInfoFormat {
/**
 * Read-write version of 4.0 segmentinfo format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene40RWSegmentInfoFormat extends Lucene40SegmentInfoFormat {

  @Override
  public SegmentInfoWriter getSegmentInfoWriter() {

@@ -25,9 +25,11 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

/**
 * Simulates writing Lucene 4.0 Stored Fields Format.
 * Read-write version of 4.0 stored fields format for testing
 * @deprecated for test purposes only
 */
public class Lucene40RWStoredFieldsFormat extends Lucene40StoredFieldsFormat {
@Deprecated
final class Lucene40RWStoredFieldsFormat extends Lucene40StoredFieldsFormat {

  @Override
  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {

@@ -23,12 +23,13 @@ import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Simulates writing Lucene 4.0 Stored Fields Format.
 * Read-write version of 4.0 term vectors format for testing
 * @deprecated for test purposes only
 */
public class Lucene40RWTermVectorsFormat extends Lucene40TermVectorsFormat {
@Deprecated
public final class Lucene40RWTermVectorsFormat extends Lucene40TermVectorsFormat {

  @Override
  public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {

@@ -31,13 +31,11 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

/**
 * Lucene 4.0 implementation of {@link SegmentInfoWriter}.
 *
 * @see Lucene40SegmentInfoFormat
 * @lucene.experimental
 * writer for 4.0 segmentinfos for testing
 * @deprecated for test purposes only
 */
@Deprecated
public class Lucene40SegmentInfoWriter extends SegmentInfoWriter {
public final class Lucene40SegmentInfoWriter extends SegmentInfoWriter {

  /** Sole constructor. */
  public Lucene40SegmentInfoWriter() {

@@ -25,14 +25,11 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter;


/**
 * Implements the skip list writer for the 4.0 posting list format
 * that stores positions and payloads.
 *
 * @see Lucene40PostingsFormat
 * @deprecated Only for reading old 4.0 segments
 * Writer of 4.0 skip lists for testing
 * @deprecated for test purposes only
 */
@Deprecated
public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
final class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
  private int[] lastSkipDoc;
  private int[] lastSkipPayloadLength;
  private int[] lastSkipOffsetLength;

@@ -35,14 +35,11 @@ import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsReader.*;


/**
 * Class responsible for writing stored document fields.
 * <p/>
 * It uses <segment>.fdt and <segment>.fdx; files.
 *
 * @see Lucene40StoredFieldsFormat
 * @lucene.experimental
 * Writer for 4.0 stored fields format for testing
 * @deprecated for test purposes only
 */
public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {
@Deprecated
final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {

  private final Directory directory;
  private final String segment;

@@ -36,24 +36,12 @@ import org.apache.lucene.util.StringHelper;

import static org.apache.lucene.codecs.lucene40.Lucene40TermVectorsReader.*;


// TODO: make a new 4.0 TV format that encodes better
//   - use startOffset (not endOffset) as base for delta on
//     next startOffset because today for syns or ngrams or
//     WDF or shingles etc. we are encoding negative vints
//     (= slow, 5 bytes per)
//   - if doc has no term vectors, write 0 into the tvx
//     file; saves a seek to tvd only to read a 0 vint (and
//     saves a byte in tvd)

/**
 * Lucene 4.0 Term Vectors writer.
 * <p>
 * It writes .tvd, .tvf, and .tvx files.
 *
 * @see Lucene40TermVectorsFormat
 * Writer for 4.0 term vectors format for testing
 * @deprecated for test purposes only
 */
public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
@Deprecated
final class Lucene40TermVectorsWriter extends TermVectorsWriter {
  private final Directory directory;
  private final String segment;
  private IndexOutput tvx = null, tvd = null, tvf = null;

@@ -24,7 +24,6 @@ import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.TestUtil;

/**
 * <code>TestBitVector</code> tests the <code>BitVector</code>, obviously.

@@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene40;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
import org.junit.BeforeClass;

public class TestLucene40StoredFieldsFormat extends BaseStoredFieldsFormatTestCase {

@@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene40;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.junit.BeforeClass;

public class TestLucene40TermVectorsFormat extends BaseTermVectorsFormatTestCase {

@@ -34,10 +34,11 @@ import org.apache.lucene.codecs.lucene40.Lucene40RWTermVectorsFormat;
 */

/**
 * Read-write version of {@link Lucene41Codec} for testing.
 * Read-write version of 4.1 codec for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene41RWCodec extends Lucene41Codec {
@Deprecated
public final class Lucene41RWCodec extends Lucene41Codec {
  private final StoredFieldsFormat fieldsFormat = new Lucene41RWStoredFieldsFormat();
  private final FieldInfosFormat fieldInfos = new Lucene40FieldInfosFormat() {
    @Override

@@ -24,8 +24,12 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

/** read-write version of Lucene41StoredsFieldsFormat for testing */
public class Lucene41RWStoredFieldsFormat extends Lucene41StoredFieldsFormat {
/**
 * Read-write version of 4.1 stored fields format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene41RWStoredFieldsFormat extends Lucene41StoredFieldsFormat {
  @Override
  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
    return new Lucene41StoredFieldsWriter(directory, si, SEGMENT_SUFFIX, context, FORMAT_NAME, COMPRESSION_MODE, CHUNK_SIZE);

@@ -27,8 +27,10 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;

/**
 * writer for lucene 4.x stored fields/vectors index for testing
 * Writer for 4.1 stored fields/term vectors index for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene41StoredFieldsIndexWriter implements Closeable {

  static final int BLOCK_SIZE = 1024; // number of chunks to serialize at once

@@ -53,8 +53,10 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * writer for Lucene 4.1 stored fields for testing
 * Writer for 4.1 stored fields format for testing
 * @deprecated for test purposes only
 */
@Deprecated
final class Lucene41StoredFieldsWriter extends StoredFieldsWriter {

  // hard limit on the maximum number of documents per chunk

@@ -34,7 +34,6 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.fst.Builder;

@@ -58,9 +57,11 @@ import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.TABLE_
import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.UNCOMPRESSED;

/**
 * Writer for {@link Lucene42DocValuesFormat}
 * Writer for 4.2 docvalues format for testing
 * @deprecated for test purposes only
 */
class Lucene42DocValuesConsumer extends DocValuesConsumer {
@Deprecated
final class Lucene42DocValuesConsumer extends DocValuesConsumer {
  final IndexOutput data, meta;
  final int maxDoc;
  final float acceptableOverheadRatio;

@@ -33,10 +33,8 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

/**
 * Lucene 4.2 FieldInfos writer.
 *
 * @see Lucene42FieldInfosFormat
 * @lucene.experimental
 * Writer for 4.2 fieldinfos format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene42FieldInfosWriter extends FieldInfosWriter {

@@ -36,9 +36,11 @@ import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.VERSION_CURRENT;

/**
 * Writer for {@link Lucene42NormsFormat}
 * Writer for 4.2 norms format for testing
 * @deprecated for test purposes only
 */
class Lucene42NormsConsumer extends NormsConsumer {
@Deprecated
final class Lucene42NormsConsumer extends NormsConsumer {
  static final byte NUMBER = 0;

  static final int BLOCK_SIZE = 4096;

@@ -28,13 +28,13 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWSegmentInfoFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Read-write version of {@link Lucene42Codec} for testing.
 * Read-Write version of 4.2 codec for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene42RWCodec extends Lucene42Codec {
@Deprecated
public final class Lucene42RWCodec extends Lucene42Codec {

  private static final DocValuesFormat dv = new Lucene42RWDocValuesFormat();
  private static final NormsFormat norms = new Lucene42RWNormsFormat();

@@ -21,13 +21,13 @@ import java.io.IOException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Read-write version of {@link Lucene42DocValuesFormat} for testing.
 * Read-Write version of 4.2 docvalues format for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene42RWDocValuesFormat extends Lucene42DocValuesFormat {
@Deprecated
public final class Lucene42RWDocValuesFormat extends Lucene42DocValuesFormat {

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

@@ -21,12 +21,13 @@ import java.io.IOException;

import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Read-write version of {@link Lucene42NormsFormat}
 * Read-write version of 4.2 norms format for testing
 * @deprecated for test purposes only
 */
public class Lucene42RWNormsFormat extends Lucene42NormsFormat {
@Deprecated
public final class Lucene42RWNormsFormat extends Lucene42NormsFormat {

  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {

@@ -24,8 +24,12 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

/** read-write version of lucene 4.2 term vectors for testing */
public class Lucene42RWTermVectorsFormat extends Lucene42TermVectorsFormat {
/**
 * Read-Write version of 4.2 term vectors format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene42RWTermVectorsFormat extends Lucene42TermVectorsFormat {

  @Override
  public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {

@@ -58,8 +58,10 @@ import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * writer for Lucene 4.2 term vectors for testing
 * Writer for 4.2 term vectors format for testing
 * @deprecated for test purposes only
 */
@Deprecated
final class Lucene42TermVectorsWriter extends TermVectorsWriter {

  // hard limit on the maximum number of documents per chunk

@@ -37,7 +37,7 @@ import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
 * Read-write version of {@link Lucene45Codec} for testing.
 */
@SuppressWarnings("deprecation")
public class Lucene45RWCodec extends Lucene45Codec {
public final class Lucene45RWCodec extends Lucene45Codec {

  private final FieldInfosFormat fieldInfosFormat = new Lucene42FieldInfosFormat() {
    @Override

@@ -24,9 +24,11 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;

/**
 * Read-write version of {@link Lucene45DocValuesFormat} for testing.
 * Read-write version of 4.5 docvalues format for testing
 * @deprecated for test purposes only
 */
public class Lucene45RWDocValuesFormat extends Lucene45DocValuesFormat {
@Deprecated
public final class Lucene45RWDocValuesFormat extends Lucene45DocValuesFormat {

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

@@ -19,7 +19,6 @@ package org.apache.lucene.codecs.lucene45;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
import org.junit.BeforeClass;

/**
 * Tests Lucene45DocValuesFormat

@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.lucene46;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.SegmentInfoWriter;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41RWStoredFieldsFormat;

@@ -29,10 +28,11 @@ import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
import org.apache.lucene.codecs.lucene45.Lucene45RWDocValuesFormat;

/**
 * Read-write version of {@link Lucene46Codec} for testing.
 * Read-write version of 4.6 codec for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene46RWCodec extends Lucene46Codec {
@Deprecated
public final class Lucene46RWCodec extends Lucene46Codec {

  private static final DocValuesFormat docValues = new Lucene45RWDocValuesFormat();

@@ -19,8 +19,12 @@ package org.apache.lucene.codecs.lucene46;

import org.apache.lucene.codecs.SegmentInfoWriter;

/** read-write version of 4.6 segmentinfos for testing */
public class Lucene46RWSegmentInfoFormat extends Lucene46SegmentInfoFormat {
/**
 * Read-Write version of 4.6 segmentinfo format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene46RWSegmentInfoFormat extends Lucene46SegmentInfoFormat {
  @Override
  public SegmentInfoWriter getSegmentInfoWriter() {
    return new Lucene46SegmentInfoWriter();

@@ -31,12 +31,11 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * Lucene 4.0 implementation of {@link SegmentInfoWriter}.
 *
 * @see Lucene46SegmentInfoFormat
 * @lucene.experimental
 * Writer for 4.0 segmentinfo format for testing
 * @deprecated for test purposes only
 */
public class Lucene46SegmentInfoWriter extends SegmentInfoWriter {
@Deprecated
final class Lucene46SegmentInfoWriter extends SegmentInfoWriter {

  /** Sole constructor. */
  public Lucene46SegmentInfoWriter() {

@@ -27,10 +27,11 @@ import org.apache.lucene.codecs.lucene42.Lucene42RWTermVectorsFormat;
import org.apache.lucene.codecs.lucene46.Lucene46RWSegmentInfoFormat;

/**
 * Read-write version of {@link Lucene49Codec} for testing.
 * Read-Write version of 4.9 codec for testing
 * @deprecated for test purposes only
 */
@SuppressWarnings("deprecation")
public class Lucene49RWCodec extends Lucene49Codec {
@Deprecated
public final class Lucene49RWCodec extends Lucene49Codec {

  private static final DocValuesFormat docValues = new Lucene49RWDocValuesFormat();

@@ -22,10 +22,13 @@ import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.LuceneTestCase;

/** Read-write version of {@link Lucene49DocValuesFormat} for testing */
public class Lucene49RWDocValuesFormat extends Lucene49DocValuesFormat {
/**
 * Read-Write version of 4.9 docvalues format for testing
 * @deprecated for test purposes only
 */
@Deprecated
public final class Lucene49RWDocValuesFormat extends Lucene49DocValuesFormat {

  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

@@ -107,22 +107,34 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    ChecksumIndexInput indexStream = null;
    try {
      final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);

    int version = -1;
    long maxPointer = -1;
    CompressingStoredFieldsIndexReader indexReader = null;

      // Load the index into memory
      indexStream = d.openChecksumInput(indexStreamFN, context);
    final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
    try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
      Throwable priorE = null;
      try {
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkSegmentHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId());
        assert CodecUtil.segmentHeaderLength(codecNameIdx) == indexStream.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);

        maxPointer = indexStream.readVLong();
      CodecUtil.checkFooter(indexStream);
      indexStream.close();
      indexStream = null;
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(indexStream, priorE);
      }
    }

    this.version = version;
    this.maxPointer = maxPointer;
    this.indexReader = indexReader;

    final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
    try {
      // Open the data file and read metadata
      fieldsStream = d.openInput(fieldsStreamFN, context);
      if (maxPointer + CodecUtil.footerLength() != fieldsStream.length()) {

@@ -149,7 +161,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this, indexStream);
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }
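
The hunk above introduces a pattern: open the checksummed index file with try-with-resources, remember the first failure, and always run the footer check so corruption is reported even on error paths. A hedged sketch mirroring the calls shown in the diff (the enclosing method and the actual reading step are illustrative only):

  static void readIndexFileWithChecksum(org.apache.lucene.store.Directory dir,
                                        String fileName,
                                        org.apache.lucene.store.IOContext context) throws java.io.IOException {
    try (org.apache.lucene.store.ChecksumIndexInput in = dir.openChecksumInput(fileName, context)) {
      Throwable priorE = null;
      try {
        // ... read the header, addresses, etc. from 'in' here ...
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        // Validates the CRC32 footer and rethrows priorE if one was recorded.
        org.apache.lucene.codecs.CodecUtil.checkFooter(in, priorE);
      }
    }
  }
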
@@ -105,21 +105,30 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    ChecksumIndexInput indexStream = null;
    try {
    int version = -1;
    CompressingStoredFieldsIndexReader indexReader = null;

      // Load the index into memory
      final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
      indexStream = d.openChecksumInput(indexStreamFN, context);
    final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
    try (ChecksumIndexInput input = d.openChecksumInput(indexName, context)) {
      Throwable priorE = null;
      try {
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
      version = CodecUtil.checkSegmentHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId());
      assert CodecUtil.segmentHeaderLength(codecNameIdx) == indexStream.getFilePointer();
      indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
        version = CodecUtil.checkSegmentHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId());
        assert CodecUtil.segmentHeaderLength(codecNameIdx) == input.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(input, si);
        input.readVLong(); // the end of the data file
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
    }

      indexStream.readVLong(); // the end of the data file
      CodecUtil.checkFooter(indexStream);
      indexStream.close();
      indexStream = null;
    this.version = version;
    this.indexReader = indexReader;

    try {
      // Open the data file and read metadata
      final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
      vectorsStream = d.openInput(vectorsStreamFN, context);

@@ -146,7 +155,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this, indexStream);
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

@@ -18,9 +18,6 @@ package org.apache.lucene.codecs.compressing;
 */

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentReader;

@@ -35,7 +32,7 @@ class MatchingReaders {
   * vectors may be bulk merged. */
  final boolean[] matchingReaders;

  /** How many {@link #matchingSegmentReaders} are set. */
  /** How many {@link #matchingReaders} are set. */
  final int count;

  MatchingReaders(MergeState mergeState) {

@@ -18,7 +18,6 @@ package org.apache.lucene.index;
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

@@ -46,14 +45,19 @@ public class MergeState {
  /** {@link FieldInfos} of the newly merged segment. */
  public FieldInfos mergeFieldInfos;

  /** Stored field producers being merged */
  public final StoredFieldsReader[] storedFieldsReaders;

  /** Term vector producers being merged */
  public final TermVectorsReader[] termVectorsReaders;

  /** Norms producers being merged */
  public final NormsProducer[] normsProducers;

  /** DocValues producers being merged */
  public final DocValuesProducer[] docValuesProducers;

  /** FieldInfos being merged */
  public final FieldInfos[] fieldInfos;

  /** Live docs for each reader */

@@ -232,8 +232,6 @@ public class SolrIndexConfig {
      iwc.setMergedSegmentWarmer(warmer);
    }

    iwc.setCheckIntegrityAtMerge(checkIntegrityAtMerge);

    return iwc;
  }

@@ -78,18 +78,6 @@ public class SolrIndexConfigTest extends SolrTestCaseJ4 {

  }

  @Test
  public void testCheckIntegrityAtMerge() throws Exception {
    SolrConfig solrConfig = new SolrConfig("solr" + File.separator
        + "collection1", "solrconfig-indexconfig.xml", null);
    SolrIndexConfig solrIndexConfig = new SolrIndexConfig(solrConfig, null, null);
    assertNotNull(solrIndexConfig.checkIntegrityAtMerge);
    assertTrue(solrIndexConfig.checkIntegrityAtMerge);
    IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
    IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(indexSchema);
    assertTrue(iwc.getCheckIntegrityAtMerge());
  }

  public void testMergedSegmentWarmerIndexConfigCreation() throws Exception {
    SolrConfig solrConfig = new SolrConfig("solr" + File.separator
        + "collection1", "solrconfig-warmer.xml", null);