LUCENE-2946: doc 4.0 livedocs format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1331464 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-04-27 15:12:37 +00:00
parent a5bd0e63cb
commit 46085655dc
2 changed files with 102 additions and 3 deletions

View File

@ -23,11 +23,45 @@ import java.util.Set;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CodecUtil; // javadocs
import org.apache.lucene.util.MutableBits;
/**
* Lucene 4.0 Live Documents Format.
* <p>
* <p>The .del file is optional, and only exists when a segment contains
* deletions.</p>
* <p>Although per-segment, this file is maintained exterior to compound segment
* files.</p>
* <p>Deletions (.del) --&gt; Format,Header,ByteCount,BitCount, Bits | DGaps (depending
* on Format)</p>
* <ul>
* <li>Format,ByteSize,BitCount --&gt; {@link DataOutput#writeInt Uint32}</li>
* <li>Bits --&gt; &lt;{@link DataOutput#writeByte Byte}&gt; <sup>ByteCount</sup></li>
* <li>DGaps --&gt; &lt;DGap,NonOnesByte&gt; <sup>NonzeroBytesCount</sup></li>
* <li>DGap --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>NonOnesByte --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* </ul>
* <p>Format is 1: indicates cleared DGaps.</p>
* <p>ByteCount indicates the number of bytes in Bits. It is typically
* (SegSize/8)+1.</p>
* <p>BitCount indicates the number of bits that are currently set in Bits.</p>
* <p>Bits contains one bit for each document indexed. When the bit corresponding
* to a document number is cleared, that document is marked as deleted. Bit ordering
* is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
* 0x02, then document 9 is marked as alive (not deleted).</p>
* <p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
* of DGaps on indexes of nonOnes bytes in Bits, and the nonOnes bytes themselves.
* The number of nonOnes bytes in Bits (NonOnesBytesCount) is not stored.</p>
* <p>For example, if there are 8000 bits and only bits 10,12,32 are cleared, DGaps
* would be used:</p>
* <p>(VInt) 1 , (byte) 20 , (VInt) 3 , (Byte) 1</p>
*/
public class Lucene40LiveDocsFormat extends LiveDocsFormat {
/** Extension of deletes */

View File

@ -28,16 +28,47 @@ import org.apache.lucene.store.DataOutput;
/**
* Utility class for reading and writing versioned headers.
* This is useful to ensure that a file is in the format
* you think it is.
* <p>
* Writing codec headers is useful to ensure that a file is in
* the format you think it is.
*
* @lucene.experimental
*/
public final class CodecUtil {
private CodecUtil() {} // no instance
private final static int CODEC_MAGIC = 0x3fd76c17;
/**
* Constant to identify the start of a codec header.
*/
public final static int CODEC_MAGIC = 0x3fd76c17;
/**
* Writes a codec header, which records both a string to
* identify the file and a version number. This header can
* be parsed and validated with
* {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
* <p>
* CodecHeader --&gt; Magic,CodecName,Version
* <ul>
* <li>Magic --&gt; {@link DataOutput#writeInt Uint32}. This
* identifies the start of the header. It is always {@value #CODEC_MAGIC}.
* <li>CodecName --&gt; {@link DataOutput#writeString String}. This
* is a string to identify this file.
* <li>Version --&gt; {@link DataOutput#writeInt Uint32}. Records
* the version of the file.
* </ul>
* <p>
* Note that the length of a codec header depends only upon the
* name of the codec, so this length can be computed at any time
* with {@link #headerLength(String)}.
*
* @param out Output stream
* @param codec String to identify this file. It should be simple ASCII,
* less than 128 characters in length.
* @param version Version number
* @throws IOException If there is an I/O error writing to the underlying medium.
*/
public static void writeHeader(DataOutput out, String codec, int version)
throws IOException {
BytesRef bytes = new BytesRef(codec);
@ -49,10 +80,44 @@ public final class CodecUtil {
out.writeInt(version);
}
/**
* Computes the length of a codec header.
*
* @param codec Codec name.
* @return length of the entire codec header.
* @see #writeHeader(DataOutput, String, int)
*/
public static int headerLength(String codec) {
return 9+codec.length();
}
/**
* Reads and validates a header previously written with
* {@link #writeHeader(DataOutput, String, int)}.
* <p>
* When reading a file, supply the expected <code>codec</code> and
* an expected version range (<code>minVersion to maxVersion</code>).
*
* @param in Input stream, positioned at the point where the
* header was previously written. Typically this is located
* at the beginning of the file.
* @param codec The expected codec name.
* @param minVersion The minimum supported expected version number.
* @param maxVersion The maximum supported expected version number.
* @return The actual version found, when a valid header is found
* that matches <code>codec</code>, with an actual version
* where <code>minVersion <= actual <= maxVersion</code>.
* Otherwise an exception is thrown.
* @throws CorruptIndexException If the first four bytes are not
* {@link #CODEC_MAGIC}, or if the actual codec found is
* not <code>codec</code>.
* @throws IndexFormatTooOldException If the actual version is less
* than <code>minVersion</code>.
* @throws IndexFormatTooNewException If the actual version is greater
* than <code>maxVersion</code>.
* @throws IOException If there is an I/O error reading from the underlying medium.
* @see #writeHeader(DataOutput, String, int)
*/
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
throws IOException {