javadocs/cleanups

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515563 13f79535-47bb-0310-9956-ffa450edef68
2013-08-19 19:07:49 +00:00 · 2013-08-19 19:07:49 +00:00 · c2f37e0de2
parent e925901014
commit c2f37e0de2
10 changed files with 120 additions and 48 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
@ -75,6 +75,7 @@ public abstract class DocValuesProducer implements Closeable {
    final SortedDocValues in;
    final int maxDoc;
    /** Creates a {@link Bits} returning true if the document has a value */
    public SortedDocsWithField(SortedDocValues in, int maxDoc) {
      this.in = in;
      this.maxDoc = maxDoc;
@ -102,6 +103,7 @@ public abstract class DocValuesProducer implements Closeable {
    final SortedSetDocValues in;
    final int maxDoc;
    /** Creates a {@link Bits} returning true if the document has a value */
    public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
      this.in = in;
      this.maxDoc = maxDoc;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene45;
 * limitations under the License.
 */
 import java.io.Closeable; // javadocs
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.HashSet;
@ -37,7 +38,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
 /** writer for {@link Lucene45DocValuesFormat} */
-public class Lucene45DocValuesConsumer extends DocValuesConsumer {
+public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {
  static final int BLOCK_SIZE = 16384;
  static final int ADDRESS_INTERVAL = 16;
@ -59,6 +60,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
  final IndexOutput data, meta;
  final int maxDoc;
  /** expert: Creates a new writer */
  public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
@ -273,6 +275,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
    }
  }
  /** expert: writes a value dictionary for a sorted/sortedset field */
  protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // first check if its a "fixed-length" terms dict
    int minLength = Integer.MAX_VALUE;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java
@ -60,19 +60,23 @@ import org.apache.lucene.util.packed.PackedInts;
 *        for each document. The addresses are written in blocks of 16k, with the current absolute
 *        start for the block, and the average (expected) delta per entry. For each document the 
 *        deviation from the delta (actual - expected) is written.
- *    <li>Prefix-compressed Binary: nocommit
+ *    <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
 *        completely and other values sharing prefixes. chunk addresses are written in blocks of 16k,
 *        with the current absolute start for the block, and the average (expected) delta per entry. 
 *        For each chunk the deviation from the delta (actual - expected) is written.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED SORTED}:
 * <ul>
- *    <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
+ *    <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, 
- *        ordinals written using one of the numeric strategies above.
+ *        along with the per-document ordinals written using one of the numeric strategies above.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED_SET SORTED_SET}:
 * <ul>
- *    <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
+ *    <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, 
- *        ordinal list written using one of the binary strategies above.  
+ *        an ordinal list and per-document index into this list are written using the numeric strategies 
 *        above. 
 * </ul>
 * <p>
 * Files:
@ -85,25 +89,35 @@ import org.apache.lucene.util.packed.PackedInts;
 *   <p>The DocValues metadata or .dvm file.</p>
 *   <p>For DocValues field, this stores metadata, such as the offset into the 
 *      DocValues data (.dvd)</p>
- *   <p>DocValues metadata (.dvm) --&gt; Header,&lt;FieldNumber,EntryType,Entry&gt;<sup>NumFields</sup></p>
+ *   <p>DocValues metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup></p>
 *   <ul>
- *     <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry</li>
+ *     <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry</li>
- *     <li>NumericEntry --&gt; DataOffset,NumericCompressionType,PackedVersion</li>
+ *     <li>NumericEntry --&gt; GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
- *     <li>BinaryEntry --&gt; DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
+ *     <li>GCDNumericEntry --&gt; NumericHeader,MinValue,GCD</li>
- *     <li>SortedEntry --&gt; DataOffset,ValueCount</li>
+ *     <li>TableNumericEntry --&gt; NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
 *     <li>DeltaNumericEntry --&gt; NumericHeader</li>
 *     <li>NumericHeader --&gt; FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
 *     <li>BinaryEntry --&gt; FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
 *     <li>FixedBinaryEntry --&gt; BinaryHeader</li>
 *     <li>VariableBinaryEntry --&gt; BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
 *     <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
 *     <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
 *     <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
 *     <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
 *     <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
 *     <li>DataOffset,DataLength --&gt; {@link DataOutput#writeLong Int64}</li>
 *     <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
 *     <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *     <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --&gt; {@link DataOutput#writeLong Int64}</li>
 *     <li>TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
 *   </ul>
- *   <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
+ *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
 *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
- *   <p>SortedSet fields have two entries: a SortedEntry with the FST metadata,
+ *   <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
- *      and an ordinary BinaryEntry for the document-to-ord-list metadata.</p>
+ *      and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
 *   <p>FieldNumber of -1 indicates the end of metadata.</p>
- *   <p>EntryType is a 0 (NumericEntry), 1 (BinaryEntry, or 2 (SortedEntry)</p>
+ *   <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
 *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
- *   <p>NumericCompressionType indicates how Numeric values will be compressed:
+ *   <p>NumericType indicates how Numeric values will be compressed:
 *      <ul>
 *         <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded
 *             from the minimum value within the block. 
@ -112,10 +126,18 @@ import org.apache.lucene.util.packed.PackedInts;
 *         <li>2 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
 *             a lookup table of unique values is written, followed by the ordinal for each document.
 *      </ul>
 *   <p>BinaryType indicates how Binary values will be stored:
 *      <ul>
 *         <li>0 --&gt; fixed-width. All values have the same length, addressing by multiplication. 
 *         <li>1 --&gt, variable-width. An address for each value is stored.
 *         <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
 *      </ul>
 *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
 *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
 *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
 *      is written for the addresses.
 *   <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
 *      If its -1, then there are no missing values.
 *   <li><a name="dvd" id="dvd"></a>
 *   <p>The DocValues data or .dvd file.</p>
 *   <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
@ -125,21 +147,18 @@ import org.apache.lucene.util.packed.PackedInts;
 *     <li>BinaryData --&gt;  {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
 *     <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li>
 *     <li>DeltaCompressedNumerics --&gt; {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
- *     <li>TableCompressedNumerics --&gt; TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
+ *     <li>TableCompressedNumerics --&gt; {@link PackedInts PackedInts}</li>
- *     <li>GCDCompressedNumerics --&gt; MinValue,GCD,{@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *     <li>GCDCompressedNumerics --&gt; {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
 *     <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
 *     <li>TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
 *     <li>MinValue --&gt; {@link DataOutput#writeLong Int64}</li>
 *     <li>GCD --&gt; {@link DataOutput#writeLong Int64}</li>
 *   </ul>
 *   <p>SortedSet entries store the list of ordinals in their BinaryData as a
 *      sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
 * </ol>
 * @lucene.experimental
 */
 // nocommit: docs are incomplete
 public final class Lucene45DocValuesFormat extends DocValuesFormat {
  /** Sole Constructor */
  public Lucene45DocValuesFormat() {
    super("Lucene45");
  }
@ -154,14 +173,14 @@ public final class Lucene45DocValuesFormat extends DocValuesFormat {
    return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  }
-  public static final String DATA_CODEC = "Lucene45DocValuesData";
+  static final String DATA_CODEC = "Lucene45DocValuesData";
-  public static final String DATA_EXTENSION = "dvd";
+  static final String DATA_EXTENSION = "dvd";
-  public static final String META_CODEC = "Lucene45ValuesMetadata";
+  static final String META_CODEC = "Lucene45ValuesMetadata";
-  public static final String META_EXTENSION = "dvm";
+  static final String META_EXTENSION = "dvm";
-  public static final int VERSION_START = 0;
+  static final int VERSION_START = 0;
-  public static final int VERSION_CURRENT = VERSION_START;
+  static final int VERSION_CURRENT = VERSION_START;
-  public static final byte NUMERIC = 0;
+  static final byte NUMERIC = 0;
-  public static final byte BINARY = 1;
+  static final byte BINARY = 1;
-  public static final byte SORTED = 2;
+  static final byte SORTED = 2;
-  public static final byte SORTED_SET = 3;
+  static final byte SORTED_SET = 3;
 }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY
 import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
 import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED;
 import java.io.Closeable; // javadocs
 import java.io.IOException;
 import java.util.Comparator;
 import java.util.HashMap;
@ -53,7 +54,8 @@ import org.apache.lucene.util.packed.BlockPackedReader;
 import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 import org.apache.lucene.util.packed.PackedInts;
-public class Lucene45DocValuesProducer extends DocValuesProducer {
+/** reader for {@link Lucene45DocValuesFormat} */
 public class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
  private final Map<Integer,NumericEntry> numerics;
  private final Map<Integer,BinaryEntry> binaries;
  private final Map<Integer,NumericEntry> ords;
@ -65,6 +67,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
  private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
  private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
  /** expert: instantiates a new reader */
  protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    // read in the entries from the metadata file.
@ -317,6 +320,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  /** returns an address instance for variable-length binary values.
   *  @lucene.internal */
  protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
    final MonotonicBlockPackedReader addresses;
    synchronized (addressInstances) {
@ -358,6 +363,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  /** returns an address instance for prefix-compressed binary values. 
   * @lucene.internal */
  protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
    final MonotonicBlockPackedReader addresses;
    final long interval = bytes.addressInterval;
@ -434,6 +441,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
  /** returns an address instance for sortedset ordinal lists
   * @lucene.internal */
  protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
    final MonotonicBlockPackedReader ordIndex;
    synchronized (ordIndexInstances) {
@ -509,7 +518,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    };
  }
-  public Bits getMissingBits(final long offset) throws IOException {
+  private Bits getMissingBits(final long offset) throws IOException {
    if (offset == -1) {
      return new Bits.MatchAllBits(maxDoc);
    } else {
@ -557,13 +566,20 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    data.close();
  }
  /** metadata entry for a numeric docvalues field */
  protected static class NumericEntry {
    private NumericEntry() {}
    /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
    long missingOffset;
    /** offset to the actual numeric values */
    public long offset;
-    public int format;
+    int format;
    /** packed ints version used to encode these numerics */
    public int packedIntsVersion;
    /** count of values written */
    public long count;
    /** packed ints blocksize */
    public int blockSize;
    long minValue;
@ -571,17 +587,26 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
    long table[];
  }
  /** metadata entry for a binary docvalues field */
  protected static class BinaryEntry {
    private BinaryEntry() {}
    /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
    long missingOffset;
    /** offset to the actual binary values */
    long offset;
    int format;
    /** count of values written */
    public long count;
    int minLength;
    int maxLength;
    /** offset to the addressing data that maps a value to its slice of the byte[] */
    public long addressesOffset;
    /** interval of shared prefix chunks (when using prefix-compressed binary) */
    public long addressInterval;
    /** packed ints version used to encode addressing information */
    public int packedIntsVersion;
    /** packed ints blocksize */
    public int blockSize;
  }
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@ -96,8 +96,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
  }
  private long docsWithFieldBytesUsed() {
-    // nocommit: this is not correct
+    // size of the long[] + some overhead
-    return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
+    return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
  }
  private void updateBytesUsed() {
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@ -70,8 +70,8 @@ class NumericDocValuesWriter extends DocValuesWriter {
  }
  private long docsWithFieldBytesUsed() {
-    // nocommit: this is not correct
+    // size of the long[] + some overhead
-    return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
+    return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
  }
  private void updateBytesUsed() {
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java
@ -28,7 +28,6 @@ import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 // nocommit
 /**
 * Just like {@link Lucene42NormsFormat} but with additional asserts.
 */
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html
@ -0,0 +1,25 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <html>
 <head>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
 </head>
 <body>
 Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc)
 </body>
 </html>
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@ -2538,7 +2538,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
    d.close();
  }
-  // nocommit: get this out of here and into the deprecated codecs (4.0, 4.2)
+  // TODO: get this out of here and into the deprecated codecs (4.0, 4.2)
  public void testHugeBinaryValueLimit() throws Exception {
    // We only test DVFormats that have a limit
    assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@ -757,14 +757,13 @@ public class _TestUtil {
    }
  }
-  // nocommit: remove this, push this test to Lucene40/Lucene42 codec tests
+  // TODO: remove this, push this test to Lucene40/Lucene42 codec tests
  public static boolean fieldSupportsHugeBinaryDocValues(String field) {
    String dvFormat = getDocValuesFormat(field);
-    System.out.println(dvFormat);
+    if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) {
-    return dvFormat.equals("Lucene45") ||
+      return false;
-      dvFormat.equals("Asserting") || 
+    }
-      dvFormat.equals("Disk") ||
+    return true;
      dvFormat.equals("SimpleText");
  }
  public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {