javadocs/cleanups

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515563 13f79535-47bb-0310-9956-ffa450edef68
2013-08-19 19:07:49 +00:00 · 2013-08-19 19:07:49 +00:00 · c2f37e0de2
parent e925901014
commit c2f37e0de2
10 changed files with 120 additions and 48 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
@ -75,6 +75,7 @@ public abstract class DocValuesProducer implements Closeable {
    final SortedDocValues in;
    final int maxDoc;
    
+    /** Creates a {@link Bits} returning true if the document has a value */
    public SortedDocsWithField(SortedDocValues in, int maxDoc) {
      this.in = in;
      this.maxDoc = maxDoc;
@ -102,6 +103,7 @@ public abstract class DocValuesProducer implements Closeable {
    final SortedSetDocValues in;
    final int maxDoc;
    
+    /** Creates a {@link Bits} returning true if the document has a value */
    public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
      this.in = in;
      this.maxDoc = maxDoc;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene45;
 * limitations under the License.
 */

+import java.io.Closeable; // javadocs
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.HashSet;
@ -37,7 +38,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;

 /** writer for {@link Lucene45DocValuesFormat} */
-public class Lucene45DocValuesConsumer extends DocValuesConsumer {
+public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {

  static final int BLOCK_SIZE = 16384;
  static final int ADDRESS_INTERVAL = 16;
@ -59,6 +60,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
  final IndexOutput data, meta;
  final int maxDoc;
  
+  /** expert: Creates a new writer */
  public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
@ -273,6 +275,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
    }
  }
  
+  /** expert: writes a value dictionary for a sorted/sortedset field */
  protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // first check if its a "fixed-length" terms dict
    int minLength = Integer.MAX_VALUE;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java
@ -60,19 +60,23 @@ import org.apache.lucene.util.packed.PackedInts;
 *        for each document. The addresses are written in blocks of 16k, with the current absolute
 *        start for the block, and the average (expected) delta per entry. For each document the 
 *        deviation from the delta (actual - expected) is written.
- *    <li>Prefix-compressed Binary: nocommit
+ *    <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
+ *        completely and other values sharing prefixes. chunk addresses are written in blocks of 16k,
+ *        with the current absolute start for the block, and the average (expected) delta per entry. 
+ *        For each chunk the deviation from the delta (actual - expected) is written.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED SORTED}:
 * <ul>
- *    <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
- *        ordinals written using one of the numeric strategies above.
+ *    <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, 
+ *        along with the per-document ordinals written using one of the numeric strategies above.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED_SET SORTED_SET}:
 * <ul>
- *    <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
- *        ordinal list written using one of the binary strategies above.  
+ *    <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, 
+ *        an ordinal list and per-document index into this list are written using the numeric strategies 
+ *        above. 
 * </ul>
 * <p>
 * Files:
@ -85,25 +89,35 @@ import org.apache.lucene.util.packed.PackedInts;
 *   <p>The DocValues metadata or .dvm file.</p>
 *   <p>For DocValues field, this stores metadata, such as the offset into the 
 *      DocValues data (.dvd)</p>
- *   <p>DocValues metadata (.dvm) --&gt; Header,&lt;FieldNumber,EntryType,Entry&gt;<sup>NumFields</sup></p>
+ *   <p>DocValues metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup></p>
 *   <ul>
- *     <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry</li>
- *     <li>NumericEntry --&gt; DataOffset,NumericCompressionType,PackedVersion</li>
- *     <li>BinaryEntry --&gt; DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
- *     <li>SortedEntry --&gt; DataOffset,ValueCount</li>
+ *     <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry</li>
+ *     <li>NumericEntry --&gt; GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
+ *     <li>GCDNumericEntry --&gt; NumericHeader,MinValue,GCD</li>
+ *     <li>TableNumericEntry --&gt; NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
+ *     <li>DeltaNumericEntry --&gt; NumericHeader</li>
+ *     <li>NumericHeader --&gt; FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
+ *     <li>BinaryEntry --&gt; FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
+ *     <li>FixedBinaryEntry --&gt; BinaryHeader</li>
+ *     <li>VariableBinaryEntry --&gt; BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
+ *     <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
+ *     <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
+ *     <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
+ *     <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
 *     <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
- *     <li>DataOffset,DataLength --&gt; {@link DataOutput#writeLong Int64}</li>
 *     <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
 *     <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *     <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --&gt; {@link DataOutput#writeLong Int64}</li>
+ *     <li>TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
 *   </ul>
- *   <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
+ *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
 *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
- *   <p>SortedSet fields have two entries: a SortedEntry with the FST metadata,
- *      and an ordinary BinaryEntry for the document-to-ord-list metadata.</p>
+ *   <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
+ *      and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
 *   <p>FieldNumber of -1 indicates the end of metadata.</p>
- *   <p>EntryType is a 0 (NumericEntry), 1 (BinaryEntry, or 2 (SortedEntry)</p>
+ *   <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
 *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
- *   <p>NumericCompressionType indicates how Numeric values will be compressed:
+ *   <p>NumericType indicates how Numeric values will be compressed:
 *      <ul>
 *         <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded
 *             from the minimum value within the block. 
@ -112,10 +126,18 @@ import org.apache.lucene.util.packed.PackedInts;
 *         <li>2 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
 *             a lookup table of unique values is written, followed by the ordinal for each document.
 *      </ul>
+ *   <p>BinaryType indicates how Binary values will be stored:
+ *      <ul>
+ *         <li>0 --&gt; fixed-width. All values have the same length, addressing by multiplication. 
+ *         <li>1 --&gt, variable-width. An address for each value is stored.
+ *         <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
+ *      </ul>
 *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
 *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
 *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
 *      is written for the addresses.
+ *   <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
+ *      If its -1, then there are no missing values.
 *   <li><a name="dvd" id="dvd"></a>
 *   <p>The DocValues data or .dvd file.</p>
 *   <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
@ -125,21 +147,18 @@ import org.apache.lucene.util.packed.PackedInts;
 *     <li>BinaryData --&gt;  {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
 *     <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li>
 *     <li>DeltaCompressedNumerics --&gt; {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
- *     <li>TableCompressedNumerics --&gt; TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
- *     <li>GCDCompressedNumerics --&gt; MinValue,GCD,{@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *     <li>TableCompressedNumerics --&gt; {@link PackedInts PackedInts}</li>
+ *     <li>GCDCompressedNumerics --&gt; {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
 *     <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
- *     <li>TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
- *     <li>MinValue --&gt; {@link DataOutput#writeLong Int64}</li>
- *     <li>GCD --&gt; {@link DataOutput#writeLong Int64}</li>
 *   </ul>
 *   <p>SortedSet entries store the list of ordinals in their BinaryData as a
 *      sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
 * </ol>
 * @lucene.experimental
 */
-// nocommit: docs are incomplete
 public final class Lucene45DocValuesFormat extends DocValuesFormat {

+  /** Sole Constructor */
  public Lucene45DocValuesFormat() {
    super("Lucene45");
  }
@ -154,14 +173,14 @@ public final class Lucene45DocValuesFormat extends DocValuesFormat {
    return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  }
  
-  public static final String DATA_CODEC = "Lucene45DocValuesData";
-  public static final String DATA_EXTENSION = "dvd";
-  public static final String META_CODEC = "Lucene45ValuesMetadata";
-  public static final String META_EXTENSION = "dvm";
-  public static final int VERSION_START = 0;
-  public static final int VERSION_CURRENT = VERSION_START;
-  public static final byte NUMERIC = 0;
-  public static final byte BINARY = 1;
-  public static final byte SORTED = 2;
-  public static final byte SORTED_SET = 3;
+  static final String DATA_CODEC = "Lucene45DocValuesData";
+  static final String DATA_EXTENSION = "dvd";
+  static final String META_CODEC = "Lucene45ValuesMetadata";
+  static final String META_EXTENSION = "dvm";
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+  static final byte NUMERIC = 0;
+  static final byte BINARY = 1;
+  static final byte SORTED = 2;
+  static final byte SORTED_SET = 3;
 }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY
 import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
 import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED;

+import java.io.Closeable; // javadocs
 import java.io.IOException;
 import java.util.Comparator;
 import java.util.HashMap;
@ -53,7 +54,8 @@ import org.apache.lucene.util.packed.BlockPackedReader;
 import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 import org.apache.lucene.util.packed.PackedInts;

-public class Lucene45DocValuesProducer extends DocValuesProducer {
+/** reader for {@link Lucene45DocValuesFormat} */
+public class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
  private final Map<Integer,NumericEntry> numerics;
  private final Map<Integer,BinaryEntry> binaries;
  private final Map<Integer,NumericEntry> ords;
@ -65,6 +67,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
  private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
  private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
  
+  /** expert: instantiates a new reader */
  protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    // read in the entries from the metadata file.
@ -317,6 +320,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  
+  /** returns an address instance for variable-length binary values.
+   *  @lucene.internal */
  protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
    final MonotonicBlockPackedReader addresses;
    synchronized (addressInstances) {
@ -358,6 +363,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  
+  /** returns an address instance for prefix-compressed binary values. 
+   * @lucene.internal */
  protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
    final MonotonicBlockPackedReader addresses;
    final long interval = bytes.addressInterval;
@ -434,6 +441,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  
+  /** returns an address instance for sortedset ordinal lists
+   * @lucene.internal */
  protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
    final MonotonicBlockPackedReader ordIndex;
    synchronized (ordIndexInstances) {
@ -509,7 +518,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  
-  public Bits getMissingBits(final long offset) throws IOException {
+  private Bits getMissingBits(final long offset) throws IOException {
    if (offset == -1) {
      return new Bits.MatchAllBits(maxDoc);
    } else {
@ -557,13 +566,20 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    data.close();
  }
  
+  /** metadata entry for a numeric docvalues field */
  protected static class NumericEntry {
+    private NumericEntry() {}
+    /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
    long missingOffset;
+    /** offset to the actual numeric values */
    public long offset;

-    public int format;
+    int format;
+    /** packed ints version used to encode these numerics */
    public int packedIntsVersion;
+    /** count of values written */
    public long count;
+    /** packed ints blocksize */
    public int blockSize;
    
    long minValue;
@ -571,17 +587,26 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    long table[];
  }
  
+  /** metadata entry for a binary docvalues field */
  protected static class BinaryEntry {
+    private BinaryEntry() {}
+    /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
    long missingOffset;
+    /** offset to the actual binary values */
    long offset;

    int format;
+    /** count of values written */
    public long count;
    int minLength;
    int maxLength;
+    /** offset to the addressing data that maps a value to its slice of the byte[] */
    public long addressesOffset;
+    /** interval of shared prefix chunks (when using prefix-compressed binary) */
    public long addressInterval;
+    /** packed ints version used to encode addressing information */
    public int packedIntsVersion;
+    /** packed ints blocksize */
    public int blockSize;
  }
  
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@ -96,8 +96,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
  }
  
  private long docsWithFieldBytesUsed() {
-    // nocommit: this is not correct
-    return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
+    // size of the long[] + some overhead
+    return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
  }

  private void updateBytesUsed() {
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@ -70,8 +70,8 @@ class NumericDocValuesWriter extends DocValuesWriter {
  }
  
  private long docsWithFieldBytesUsed() {
-    // nocommit: this is not correct
-    return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
+    // size of the long[] + some overhead
+    return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
  }

  private void updateBytesUsed() {
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java
@ -28,7 +28,6 @@ import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;

-// nocommit
 /**
 * Just like {@link Lucene42NormsFormat} but with additional asserts.
 */
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html
@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc)
+</body>
+</html>
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@ -2538,7 +2538,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
    d.close();
  }

-  // nocommit: get this out of here and into the deprecated codecs (4.0, 4.2)
+  // TODO: get this out of here and into the deprecated codecs (4.0, 4.2)
  public void testHugeBinaryValueLimit() throws Exception {
    // We only test DVFormats that have a limit
    assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@ -757,14 +757,13 @@ public class _TestUtil {
    }
  }

-  // nocommit: remove this, push this test to Lucene40/Lucene42 codec tests
+  // TODO: remove this, push this test to Lucene40/Lucene42 codec tests
  public static boolean fieldSupportsHugeBinaryDocValues(String field) {
    String dvFormat = getDocValuesFormat(field);
-    System.out.println(dvFormat);
-    return dvFormat.equals("Lucene45") ||
-      dvFormat.equals("Asserting") || 
-      dvFormat.equals("Disk") ||
-      dvFormat.equals("SimpleText");
+    if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) {
+      return false;
+    }
+    return true;
  }

  public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {