LUCENE-2946: doc 4.0 livedocs format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1331464 13f79535-47bb-0310-9956-ffa450edef68
2012-04-27 15:12:37 +00:00 · 2012-04-27 15:12:37 +00:00 · 46085655dc
parent a5bd0e63cb
commit 46085655dc
2 changed files with 102 additions and 3 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
@ -23,11 +23,45 @@ import java.util.Set;
 import org.apache.lucene.codecs.LiveDocsFormat;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.DataOutput; // javadocs
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CodecUtil; // javadocs
 import org.apache.lucene.util.MutableBits;

+/**
+ * Lucene 4.0 Live Documents Format.
+ * <p>
+ * <p>The .del file is optional, and only exists when a segment contains
+ * deletions.</p>
+ * <p>Although per-segment, this file is maintained exterior to compound segment
+ * files.</p>
+ * <p>Deletions (.del) --&gt; Format,Header,ByteCount,BitCount, Bits | DGaps (depending
+ * on Format)</p>
+ * <ul>
+ *   <li>Format,ByteSize,BitCount --&gt; {@link DataOutput#writeInt Uint32}</li>
+ *   <li>Bits --&gt; &lt;{@link DataOutput#writeByte Byte}&gt; <sup>ByteCount</sup></li>
+ *   <li>DGaps --&gt; &lt;DGap,NonOnesByte&gt; <sup>NonzeroBytesCount</sup></li>
+ *   <li>DGap --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>NonOnesByte --&gt; {@link DataOutput#writeByte Byte}</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ * </ul>
+ * <p>Format is 1: indicates cleared DGaps.</p>
+ * <p>ByteCount indicates the number of bytes in Bits. It is typically
+ * (SegSize/8)+1.</p>
+ * <p>BitCount indicates the number of bits that are currently set in Bits.</p>
+ * <p>Bits contains one bit for each document indexed. When the bit corresponding
+ * to a document number is cleared, that document is marked as deleted. Bit ordering
+ * is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
+ * 0x02, then document 9 is marked as alive (not deleted).</p>
+ * <p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
+ * of DGaps on indexes of nonOnes bytes in Bits, and the nonOnes bytes themselves.
+ * The number of nonOnes bytes in Bits (NonOnesBytesCount) is not stored.</p>
+ * <p>For example, if there are 8000 bits and only bits 10,12,32 are cleared, DGaps
+ * would be used:</p>
+ * <p>(VInt) 1 , (byte) 20 , (VInt) 3 , (Byte) 1</p>
+ */
 public class Lucene40LiveDocsFormat extends LiveDocsFormat {

  /** Extension of deletes */
--- a/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java
@ -28,16 +28,47 @@ import org.apache.lucene.store.DataOutput;

 /**
 * Utility class for reading and writing versioned headers.
- * This is useful to ensure that a file is in the format
- * you think it is.
+ * <p>
+ * Writing codec headers is useful to ensure that a file is in 
+ * the format you think it is.
+ * 
 * @lucene.experimental
 */

 public final class CodecUtil {
  private CodecUtil() {} // no instance

-  private final static int CODEC_MAGIC = 0x3fd76c17;
+  /**
+   * Constant to identify the start of a codec header.
+   */
+  public final static int CODEC_MAGIC = 0x3fd76c17;

+  /**
+   * Writes a codec header, which records both a string to
+   * identify the file and a version number. This header can
+   * be parsed and validated with 
+   * {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
+   * <p>
+   * CodecHeader --&gt; Magic,CodecName,Version
+   * <ul>
+   *    <li>Magic --&gt; {@link DataOutput#writeInt Uint32}. This
+   *        identifies the start of the header. It is always {@value #CODEC_MAGIC}.
+   *    <li>CodecName --&gt; {@link DataOutput#writeString String}. This
+   *        is a string to identify this file.
+   *    <li>Version --&gt; {@link DataOutput#writeInt Uint32}. Records
+   *        the version of the file.
+   * </ul>
+   * <p>
+   * Note that the length of a codec header depends only upon the
+   * name of the codec, so this length can be computed at any time
+   * with {@link #headerLength(String)}.
+   * 
+   * @param out Output stream
+   * @param codec String to identify this file. It should be simple ASCII, 
+   *              less than 128 characters in length.
+   * @param version Version number
+   * @throws IOException If there is an I/O error writing to the underlying medium.
+   */
  public static void writeHeader(DataOutput out, String codec, int version)
    throws IOException {
    BytesRef bytes = new BytesRef(codec);
@ -49,10 +80,44 @@ public final class CodecUtil {
    out.writeInt(version);
  }

+  /**
+   * Computes the length of a codec header.
+   * 
+   * @param codec Codec name.
+   * @return length of the entire codec header.
+   * @see #writeHeader(DataOutput, String, int)
+   */
  public static int headerLength(String codec) {
    return 9+codec.length();
  }

+  /**
+   * Reads and validates a header previously written with 
+   * {@link #writeHeader(DataOutput, String, int)}.
+   * <p>
+   * When reading a file, supply the expected <code>codec</code> and
+   * an expected version range (<code>minVersion to maxVersion</code>).
+   * 
+   * @param in Input stream, positioned at the point where the
+   *        header was previously written. Typically this is located
+   *        at the beginning of the file.
+   * @param codec The expected codec name.
+   * @param minVersion The minimum supported expected version number.
+   * @param maxVersion The maximum supported expected version number.
+   * @return The actual version found, when a valid header is found 
+   *         that matches <code>codec</code>, with an actual version 
+   *         where <code>minVersion <= actual <= maxVersion</code>.
+   *         Otherwise an exception is thrown.
+   * @throws CorruptIndexException If the first four bytes are not
+   *         {@link #CODEC_MAGIC}, or if the actual codec found is
+   *         not <code>codec</code>.
+   * @throws IndexFormatTooOldException If the actual version is less 
+   *         than <code>minVersion</code>.
+   * @throws IndexFormatTooNewException If the actual version is greater 
+   *         than <code>maxVersion</code>.
+   * @throws IOException If there is an I/O error reading from the underlying medium.
+   * @see #writeHeader(DataOutput, String, int)
+   */
  public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
    throws IOException {