additional 4.0 checks, javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438595 13f79535-47bb-0310-9956-ffa450edef68
2025-02-08 19:15:06 +00:00 · 2013-01-25 17:12:34 +00:00 · 2013-01-25 17:12:34 +00:00 · 86e30c7f7f
commit 86e30c7f7f
parent f7db724186
5 changed files with 211 additions and 24 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java
@ -19,17 +19,113 @@ package org.apache.lucene.codecs.lucene40;
 import java.io.IOException;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.CompoundFileDirectory;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.packed.PackedInts;
 /**
 * Lucene 4.0 DocValues format.
 * <p>
 * Files:
 * <ul>
 *   <li><tt>.dv.cfs</tt>: {@link CompoundFileDirectory compound container}</li>
 *   <li><tt>.dv.cfe</tt>: {@link CompoundFileDirectory compound entries}</li>
 * </ul>
 * Entries within the compound file:
 * <ul>
 *   <li><tt>&lt;segment&gt;_&lt;fieldNumber&gt;.dat</tt>: data values</li>
 *   <li><tt>&lt;segment&gt;_&lt;fieldNumber&gt;.idx</tt>: index into the .dat for DEREF types</li>
 * </ul>
 * <p>
 * There are several many types of {@code DocValues} with different encodings.
 * From the perspective of filenames, all types store their values in <tt>.dat</tt>
 * entries within the compound file. In the case of dereferenced/sorted types, the <tt>.dat</tt>
 * actually contains only the unique values, and an additional <tt>.idx</tt> file contains
 * pointers to these unique values.
 * </p>
 * Formats:
 * <ul>
 *    <li>{@code VAR_INTS} .dat --&gt; Header, PackedType, MinValue, 
 *        DefaultValue, PackedStream</li>
 *    <li>{@code FIXED_INTS_8} .dat --&gt; Header, ValueSize, 
 *        {@link DataOutput#writeByte Byte}<sup>maxdoc</sup></li>
 *    <li>{@code FIXED_INTS_16} .dat --&gt; Header, ValueSize,
 *        {@link DataOutput#writeShort Short}<sup>maxdoc</sup></li>
 *    <li>{@code FIXED_INTS_32} .dat --&gt; Header, ValueSize,
 *        {@link DataOutput#writeInt Int32}<sup>maxdoc</sup></li>
 *    <li>{@code FIXED_INTS_64} .dat --&gt; Header, ValueSize,
 *        {@link DataOutput#writeLong Int64}<sup>maxdoc</sup></li>
 *    <li>{@code FLOAT_32} .dat --&gt; Header, ValueSize, Float32<sup>maxdoc</sup></li>
 *    <li>{@code FLOAT_64} .dat --&gt; Header, ValueSize, Float64<sup>maxdoc</sup></li>
 *    <li>{@code BYTES_FIXED_STRAIGHT} .dat --&gt; Header, ValueSize,
 *        ({@link DataOutput#writeByte Byte} * ValueSize)<sup>maxdoc</sup></li>
 *    <li>{@code BYTES_VAR_STRAIGHT} .idx --&gt; Header, TotalBytes, Addresses</li>
 *    <li>{@code BYTES_VAR_STRAIGHT} .dat --&gt; Header,
          ({@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>maxdoc</sup></li>
 *    <li>{@code BYTES_FIXED_DEREF} .idx --&gt; Header, NumValues, Addresses</li>
 *    <li>{@code BYTES_FIXED_DEREF} .dat --&gt; Header, ValueSize,
 *        ({@link DataOutput#writeByte Byte} * ValueSize)<sup>NumValues</sup></li>
 *    <li>{@code BYTES_VAR_DEREF} .idx --&gt; Header, TotalVarBytes, Addresses</li>
 *    <li>{@code BYTES_VAR_DEREF} .dat --&gt; Header,
 *        (LengthPrefix + {@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>NumValues</sup></li>
 *    <li>{@code BYTES_FIXED_SORTED} .idx --&gt; Header, NumValues, Ordinals</li>
 *    <li>{@code BYTES_FIXED_SORTED} .dat --&gt; Header, ValueSize,
 *        ({@link DataOutput#writeByte Byte} * ValueSize)<sup>NumValues</sup></li>
 *    <li>{@code BYTES_VAR_SORTED} .idx --&gt; Header, TotalVarBytes, Addresses, Ordinals</li>
 *    <li>{@code BYTES_VAR_SORTED} .dat --&gt; Header,
 *        ({@link DataOutput#writeByte Byte} * <i>variable ValueSize</i>)<sup>NumValues</sup></li>
 * </ul>
 * Data Types:
 * <ul>
 *    <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *    <li>PackedType --&gt; {@link DataOutput#writeByte Byte}</li>
 *    <li>MaxAddress, MinValue, DefaultValue --&gt; {@link DataOutput#writeLong Int64}</li>
 *    <li>PackedStream, Addresses, Ordinals --&gt; {@link PackedInts}</li>
 *    <li>ValueSize, NumValues --&gt; {@link DataOutput#writeInt Int32}</li>
 *    <li>Float32 --&gt; 32-bit float encoded with {@link Float#floatToRawIntBits(float)}
 *                       then written as {@link DataOutput#writeInt Int32}</li>
 *    <li>Float64 --&gt; 64-bit float encoded with {@link Double#doubleToRawLongBits(double)}
 *                       then written as {@link DataOutput#writeLong Int64}</li>
 *    <li>TotalBytes --&gt; {@link DataOutput#writeVLong VLong}</li>
 *    <li>TotalVarBytes --&gt; {@link DataOutput#writeLong Int64}</li>
 *    <li>LengthPrefix --&gt; Length of the data value as {@link DataOutput#writeVInt VInt} (maximum
 *                       of 2 bytes)</li>
 * </ul>
 * Notes:
 * <ul>
 *    <li>PackedType is a 0 when compressed, 1 when the stream is written as 64-bit integers.</li>
 *    <li>Addresses stores pointers to the actual byte location (indexed by docid). In the VAR_STRAIGHT
 *        case, each entry can have a different length, so to determine the length, docid+1 is 
 *        retrieved. A sentinel address is written at the end for the VAR_STRAIGHT case, so the Addresses 
 *        stream contains maxdoc+1 indices. For the deduplicated VAR_DEREF case, each length
 *        is encoded as a prefix to the data itself as a {@link DataOutput#writeVInt VInt} 
 *        (maximum of 2 bytes).</li>
 *    <li>Ordinals stores the term ID in sorted order (indexed by docid). In the FIXED_SORTED case,
 *        the address into the .dat can be computed from the ordinal as 
 *        <code>Header+ValueSize+(ordinal*ValueSize)</code> because the byte length is fixed.
 *        In the VAR_SORTED case, there is double indirection (docid -> ordinal -> address), but
 *        an additional sentinel ordinal+address is always written (so there are NumValues+1 ordinals). To
 *        determine the length, ord+1's address is looked up as well.</li>
 *    <li>{@code BYTES_VAR_STRAIGHT BYTES_VAR_STRAIGHT} in contrast to other straight 
 *        variants uses a <tt>.idx</tt> file to improve lookup perfromance. In contrast to 
 *        {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values.
 *    </li>
 * </ul>
 * @deprecated Only for reading old 4.0 and 4.1 segments
 */
@Deprecated
 // NOTE: not registered in SPI, doesnt respect segment suffix, etc
 // for back compat only!
 public class Lucene40DocValuesFormat extends DocValuesFormat {
  /** Sole constructor. */
  public Lucene40DocValuesFormat() {
    super("Lucene40");
  }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
@ -38,6 +38,12 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.packed.PackedInts;
 /**
 * Reads the 4.0 format of norms/docvalues
 * @lucene.experimental
 * @deprecated Only for reading old 4.0 and 4.1 segments
 */
@Deprecated
 class Lucene40DocValuesReader extends DocValuesProducer {
  private final Directory dir;
  private final SegmentReadState state;
@ -56,24 +62,6 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    this.state = state;
    this.legacyKey = legacyKey;
    this.dir = new CompoundFileDirectory(state.directory, filename, state.context, false);
    // nocommit: uncomment to debug
    /*
    if (legacyKey.equals(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY)) {
      System.out.println("dv READER:");
      for (FieldInfo fi : state.fieldInfos) {
        if (fi.hasDocValues()) {
          System.out.println(fi.name + " -> " + fi.getAttribute(legacyKey) + " -> " + fi.getDocValuesType());
        }
      }
    } else {
      System.out.println("nrm READER:");
      for (FieldInfo fi : state.fieldInfos) {
        if (fi.hasNorms()) {
          System.out.println(fi.name + " -> " + fi.getAttribute(legacyKey) + " -> " + fi.getNormType());
        }
      }
    }
    */
  }
  @Override
@ -109,6 +97,9 @@ class Lucene40DocValuesReader extends DocValuesProducer {
          default: 
            throw new AssertionError();
        }
        if (input.getFilePointer() != input.length()) {
          throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length() + " (resource: " + input + ")");
        }
        success = true;
      } finally {
        if (success) {
@ -163,7 +154,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.INTS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.INTS_VERSION_START, 
                                 Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 1) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final byte values[] = new byte[maxDoc];
    input.readBytes(values, 0, values.length);
@ -179,7 +173,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.INTS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.INTS_VERSION_START, 
                                 Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 2) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final short values[] = new short[maxDoc];
    for (int i = 0; i < values.length; i++) {
@ -197,7 +194,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.INTS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.INTS_VERSION_START, 
                                 Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 4) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final int values[] = new int[maxDoc];
    for (int i = 0; i < values.length; i++) {
@ -215,7 +215,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.INTS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.INTS_VERSION_START, 
                                 Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 8) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final long values[] = new long[maxDoc];
    for (int i = 0; i < values.length; i++) {
@ -233,7 +236,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.FLOATS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.FLOATS_VERSION_START, 
                                 Lucene40DocValuesFormat.FLOATS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 4) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final int values[] = new int[maxDoc];
    for (int i = 0; i < values.length; i++) {
@ -251,7 +257,10 @@ class Lucene40DocValuesReader extends DocValuesProducer {
    CodecUtil.checkHeader(input, Lucene40DocValuesFormat.FLOATS_CODEC_NAME, 
                                 Lucene40DocValuesFormat.FLOATS_VERSION_START, 
                                 Lucene40DocValuesFormat.FLOATS_VERSION_CURRENT);
-    input.readInt();
+    int valueSize = input.readInt();
    if (valueSize != 8) {
      throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = state.segmentInfo.getDocCount();
    final long values[] = new long[maxDoc];
    for (int i = 0; i < values.length; i++) {
@ -302,6 +311,9 @@ class Lucene40DocValuesReader extends DocValuesProducer {
      // nocommit? can the current impl even handle > 2G?
      final byte bytes[] = new byte[state.segmentInfo.getDocCount() * fixedLength];
      input.readBytes(bytes, 0, bytes.length);
      if (input.getFilePointer() != input.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length() + " (resource: " + input + ")");
      }
      success = true;
      return new BinaryDocValues() {
        @Override
@ -340,6 +352,12 @@ class Lucene40DocValuesReader extends DocValuesProducer {
      final byte bytes[] = new byte[(int)totalBytes];
      data.readBytes(bytes, 0, bytes.length);
      final PackedInts.Reader reader = PackedInts.getReader(index);
      if (data.getFilePointer() != data.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + dataName + "\": read " + data.getFilePointer() + " vs size " + data.length() + " (resource: " + data + ")");
      }
      if (index.getFilePointer() != index.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + indexName + "\": read " + index.getFilePointer() + " vs size " + index.length() + " (resource: " + index + ")");
      }
      success = true;
      return new BinaryDocValues() {
        @Override
@ -382,6 +400,12 @@ class Lucene40DocValuesReader extends DocValuesProducer {
      final byte bytes[] = new byte[fixedLength * valueCount];
      data.readBytes(bytes, 0, bytes.length);
      final PackedInts.Reader reader = PackedInts.getReader(index);
      if (data.getFilePointer() != data.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + dataName + "\": read " + data.getFilePointer() + " vs size " + data.length() + " (resource: " + data + ")");
      }
      if (index.getFilePointer() != index.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + indexName + "\": read " + index.getFilePointer() + " vs size " + index.length() + " (resource: " + index + ")");
      }
      success = true;
      return new BinaryDocValues() {
        @Override
@ -422,6 +446,12 @@ class Lucene40DocValuesReader extends DocValuesProducer {
      final byte bytes[] = new byte[(int)totalBytes];
      data.readBytes(bytes, 0, bytes.length);
      final PackedInts.Reader reader = PackedInts.getReader(index);
      if (data.getFilePointer() != data.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + dataName + "\": read " + data.getFilePointer() + " vs size " + data.length() + " (resource: " + data + ")");
      }
      if (index.getFilePointer() != index.length()) {
        throw new CorruptIndexException("did not read all bytes from file \"" + indexName + "\": read " + index.getFilePointer() + " vs size " + index.length() + " (resource: " + index + ")");
      }
      success = true;
      return new BinaryDocValues() {
        @Override
@ -470,6 +500,12 @@ class Lucene40DocValuesReader extends DocValuesProducer {
          default:
            throw new AssertionError();
        }
        if (data.getFilePointer() != data.length()) {
          throw new CorruptIndexException("did not read all bytes from file \"" + dataName + "\": read " + data.getFilePointer() + " vs size " + data.length() + " (resource: " + data + ")");
        }
        if (index.getFilePointer() != index.length()) {
          throw new CorruptIndexException("did not read all bytes from file \"" + indexName + "\": read " + index.getFilePointer() + " vs size " + index.length() + " (resource: " + index + ")");
        }
        success = true;
      } finally {
        if (success) {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40NormsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40NormsFormat.java
@ -25,9 +25,29 @@ import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.CompoundFileDirectory;
 /**
 * Lucene 4.0 Norms Format.
 * <p>
 * Files:
 * <ul>
 *   <li><tt>.nrm.cfs</tt>: {@link CompoundFileDirectory compound container}</li>
 *   <li><tt>.nrm.cfe</tt>: {@link CompoundFileDirectory compound entries}</li>
 * </ul>
 * Norms are implemented as DocValues, so other than file extension, norms are 
 * written exactly the same way as {@link Lucene40DocValuesFormat DocValues}.
 * 
 * @see Lucene40DocValuesFormat
 * @lucene.experimental
 * @deprecated Only for reading old 4.0 and 4.1 segments
 */
@Deprecated
 public class Lucene40NormsFormat extends NormsFormat {
  /** Sole constructor. */
  public Lucene40NormsFormat() {}
  @Override
  public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("this codec can only be used for reading");
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
@ -81,9 +81,26 @@ public final class FieldInfo {
    DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
  };
  /**
   * DocValues types.
   * Note that DocValues is strongly typed, so a field cannot have different types
   * across different documents.
   */
  public static enum DocValuesType {
    /** 
     * A per-document Number
     */
    NUMERIC,
    /**
     * A per-document byte[].
     */
    BINARY,
    /** 
     * A pre-sorted byte[]. Fields with this type only store distinct byte values 
     * and store an additional offset pointer per document to dereference the shared 
     * byte[]. The stored byte[] is presorted and allows access via document id, 
     * ordinal and by-value.
     */
    SORTED
  };
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@ -31,9 +31,24 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 /**
 * A wrapper for CompositeIndexReader providing access to DocValues.
 * 
 * <p><b>NOTE</b>: for multi readers, you'll get better
 * performance by gathering the sub readers using
 * {@link IndexReader#getContext()} to get the
 * atomic leaves and then operate per-AtomicReader,
 * instead of using this class.
 * 
 * <p><b>NOTE</b>: This is very costly.
 *
 * @lucene.experimental
 * @lucene.internal
 */
 // nocommit move this back to test-framework!!!
 public class MultiDocValues {
  /** returns a NumericDocValues for a reader's norms (potentially merging on-the-fly) */
  // moved to src/java so SlowWrapper can use it... uggggggh
  public static NumericDocValues getNormValues(final IndexReader r, final String field) throws IOException {
    final List<AtomicReaderContext> leaves = r.leaves();
@ -74,6 +89,7 @@ public class MultiDocValues {
    };
  }
  /** returns a NumericDocValues for a reader's docvalues (potentially merging on-the-fly) */
  public static NumericDocValues getNumericValues(final IndexReader r, final String field) throws IOException {
    final List<AtomicReaderContext> leaves = r.leaves();
    if (leaves.size() == 1) {
@ -111,6 +127,7 @@ public class MultiDocValues {
    }
  }
  /** returns a BinaryDocValues for a reader's docvalues (potentially merging on-the-fly) */
  public static BinaryDocValues getBinaryValues(final IndexReader r, final String field) throws IOException {
    final List<AtomicReaderContext> leaves = r.leaves();
    if (leaves.size() == 1) {
@ -152,6 +169,7 @@ public class MultiDocValues {
    }
  }
  /** returns a SortedDocValues for a reader's docvalues (potentially doing extremely slow things) */
  public static SortedDocValues getSortedValues(final IndexReader r, final String field) throws IOException {
    final List<AtomicReaderContext> leaves = r.leaves();
    if (leaves.size() == 1) {