mirror of https://github.com/apache/lucene.git
LUCENE-2946: doc 4.0 livedocs format
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1331464 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a5bd0e63cb
commit
46085655dc
|
@ -23,11 +23,45 @@ import java.util.Set;
|
|||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.store.DataOutput; // javadocs
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.CodecUtil; // javadocs
|
||||
import org.apache.lucene.util.MutableBits;
|
||||
|
||||
/**
|
||||
* Lucene 4.0 Live Documents Format.
|
||||
* <p>
|
||||
* <p>The .del file is optional, and only exists when a segment contains
|
||||
* deletions.</p>
|
||||
* <p>Although per-segment, this file is maintained exterior to compound segment
|
||||
* files.</p>
|
||||
* <p>Deletions (.del) --> Format,Header,ByteCount,BitCount, Bits | DGaps (depending
|
||||
* on Format)</p>
|
||||
* <ul>
|
||||
* <li>Format,ByteCount,BitCount --> {@link DataOutput#writeInt Uint32}</li>
|
||||
* <li>Bits --> <{@link DataOutput#writeByte Byte}> <sup>ByteCount</sup></li>
|
||||
* <li>DGaps --> <DGap,NonOnesByte> <sup>NonOnesBytesCount</sup></li>
|
||||
* <li>DGap --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>NonOnesByte --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* </ul>
|
||||
* <p>Format is 1: indicates cleared DGaps.</p>
|
||||
* <p>ByteCount indicates the number of bytes in Bits. It is typically
|
||||
* (SegSize/8)+1.</p>
|
||||
* <p>BitCount indicates the number of bits that are currently set in Bits.</p>
|
||||
* <p>Bits contains one bit for each document indexed. When the bit corresponding
|
||||
* to a document number is cleared, that document is marked as deleted. Bit ordering
|
||||
* is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
|
||||
* 0x02, then document 9 is marked as alive (not deleted).</p>
|
||||
* <p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
|
||||
* of DGaps on indexes of nonOnes bytes in Bits, and the nonOnes bytes themselves.
|
||||
* The number of nonOnes bytes in Bits (NonOnesBytesCount) is not stored.</p>
|
||||
* <p>For example, if there are 8000 bits and only bits 10,12,32 are cleared, DGaps
|
||||
* would be used:</p>
|
||||
* <p>(VInt) 1 , (byte) 0xEB , (VInt) 3 , (Byte) 0xFE (byte 1 has bits 2 and 4 cleared; byte 4 has bit 0 cleared)</p>
|
||||
*/
|
||||
public class Lucene40LiveDocsFormat extends LiveDocsFormat {
|
||||
|
||||
/** Extension of deletes */
|
||||
|
|
|
@ -28,16 +28,47 @@ import org.apache.lucene.store.DataOutput;
|
|||
|
||||
/**
|
||||
* Utility class for reading and writing versioned headers.
|
||||
* This is useful to ensure that a file is in the format
|
||||
* you think it is.
|
||||
* <p>
|
||||
* Writing codec headers is useful to ensure that a file is in
|
||||
* the format you think it is.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
public final class CodecUtil {
|
||||
private CodecUtil() {} // no instance
|
||||
|
||||
private final static int CODEC_MAGIC = 0x3fd76c17;
|
||||
/**
|
||||
* Constant to identify the start of a codec header.
|
||||
*/
|
||||
public final static int CODEC_MAGIC = 0x3fd76c17;
|
||||
|
||||
/**
|
||||
* Writes a codec header, which records both a string to
|
||||
* identify the file and a version number. This header can
|
||||
* be parsed and validated with
|
||||
* {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
|
||||
* <p>
|
||||
* CodecHeader --> Magic,CodecName,Version
|
||||
* <ul>
|
||||
* <li>Magic --> {@link DataOutput#writeInt Uint32}. This
|
||||
* identifies the start of the header. It is always {@value #CODEC_MAGIC}.
|
||||
* <li>CodecName --> {@link DataOutput#writeString String}. This
|
||||
* is a string to identify this file.
|
||||
* <li>Version --> {@link DataOutput#writeInt Uint32}. Records
|
||||
* the version of the file.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Note that the length of a codec header depends only upon the
|
||||
* name of the codec, so this length can be computed at any time
|
||||
* with {@link #headerLength(String)}.
|
||||
*
|
||||
* @param out Output stream
|
||||
* @param codec String to identify this file. It should be simple ASCII,
|
||||
* less than 128 characters in length.
|
||||
* @param version Version number
|
||||
* @throws IOException If there is an I/O error writing to the underlying medium.
|
||||
*/
|
||||
public static void writeHeader(DataOutput out, String codec, int version)
|
||||
throws IOException {
|
||||
BytesRef bytes = new BytesRef(codec);
|
||||
|
@ -49,10 +80,44 @@ public final class CodecUtil {
|
|||
out.writeInt(version);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the length of a codec header.
|
||||
*
|
||||
* @param codec Codec name.
|
||||
* @return length of the entire codec header.
|
||||
* @see #writeHeader(DataOutput, String, int)
|
||||
*/
|
||||
public static int headerLength(String codec) {
|
||||
// The constant 9 covers the fixed-size parts of the header: the 4-byte magic
// and the 4-byte version (both written with writeInt), plus presumably a
// 1-byte length prefix for the codec name -- TODO confirm against writeHeader.
// codec.length() counts chars, not encoded bytes, so the result is only exact
// for the simple-ASCII, shorter-than-128-character names that writeHeader's
// documentation asks for.
return 9+codec.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads and validates a header previously written with
|
||||
* {@link #writeHeader(DataOutput, String, int)}.
|
||||
* <p>
|
||||
* When reading a file, supply the expected <code>codec</code> and
|
||||
* an expected version range (<code>minVersion to maxVersion</code>).
|
||||
*
|
||||
* @param in Input stream, positioned at the point where the
|
||||
* header was previously written. Typically this is located
|
||||
* at the beginning of the file.
|
||||
* @param codec The expected codec name.
|
||||
* @param minVersion The minimum supported expected version number.
|
||||
* @param maxVersion The maximum supported expected version number.
|
||||
* @return The actual version found, when a valid header is found
|
||||
* that matches <code>codec</code>, with an actual version
|
||||
* where <code>minVersion <= actual <= maxVersion</code>.
|
||||
* Otherwise an exception is thrown.
|
||||
* @throws CorruptIndexException If the first four bytes are not
|
||||
* {@link #CODEC_MAGIC}, or if the actual codec found is
|
||||
* not <code>codec</code>.
|
||||
* @throws IndexFormatTooOldException If the actual version is less
|
||||
* than <code>minVersion</code>.
|
||||
* @throws IndexFormatTooNewException If the actual version is greater
|
||||
* than <code>maxVersion</code>.
|
||||
* @throws IOException If there is an I/O error reading from the underlying medium.
|
||||
* @see #writeHeader(DataOutput, String, int)
|
||||
*/
|
||||
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
|
||||
throws IOException {
|
||||
|
||||
|
|
Loading…
Reference in New Issue