From c2f37e0de2d94fbe4c07f3d0aecfea250788eebb Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 19:07:49 +0000 Subject: [PATCH] javadocs/cleanups git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515563 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/codecs/DocValuesProducer.java | 2 + .../lucene45/Lucene45DocValuesConsumer.java | 5 +- .../lucene45/Lucene45DocValuesFormat.java | 83 ++++++++++++------- .../lucene45/Lucene45DocValuesProducer.java | 31 ++++++- .../lucene/index/BinaryDocValuesWriter.java | 4 +- .../lucene/index/NumericDocValuesWriter.java | 4 +- .../asserting/AssertingNormsFormat.java | 1 - .../org/apache/lucene/codecs/package.html | 25 ++++++ .../index/BaseDocValuesFormatTestCase.java | 2 +- .../org/apache/lucene/util/_TestUtil.java | 11 ++- 10 files changed, 120 insertions(+), 48 deletions(-) create mode 100644 lucene/test-framework/src/java/org/apache/lucene/codecs/package.html diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java index 04778aa1201..05dfcf1b0dd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java @@ -75,6 +75,7 @@ public abstract class DocValuesProducer implements Closeable { final SortedDocValues in; final int maxDoc; + /** Creates a {@link Bits} returning true if the document has a value */ public SortedDocsWithField(SortedDocValues in, int maxDoc) { this.in = in; this.maxDoc = maxDoc; @@ -102,6 +103,7 @@ public abstract class DocValuesProducer implements Closeable { final SortedSetDocValues in; final int maxDoc; + /** Creates a {@link Bits} returning true if the document has a value */ public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) { this.in = in; this.maxDoc = maxDoc; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java index 21ee03075f5..e5afdf70abb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene45; * limitations under the License. */ +import java.io.Closeable; // javadocs import java.io.IOException; import java.util.HashMap; import java.util.HashSet; @@ -37,7 +38,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; /** writer for {@link Lucene45DocValuesFormat} */ -public class Lucene45DocValuesConsumer extends DocValuesConsumer { +public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable { static final int BLOCK_SIZE = 16384; static final int ADDRESS_INTERVAL = 16; @@ -59,6 +60,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { final IndexOutput data, meta; final int maxDoc; + /** expert: Creates a new writer */ public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { boolean success = false; try { @@ -273,6 +275,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { } } + /** expert: writes a value dictionary for a sorted/sortedset field */ protected void addTermsDict(FieldInfo field, final Iterable values) throws IOException { // first check if its a "fixed-length" terms dict int minLength = Integer.MAX_VALUE; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java index 68a44370b30..3f3387ae244 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java @@ -60,19 +60,23 @@ import org.apache.lucene.util.packed.PackedInts; * for each document. The addresses are written in blocks of 16k, with the current absolute * start for the block, and the average (expected) delta per entry. For each document the * deviation from the delta (actual - expected) is written. - *
  • Prefix-compressed Binary: nocommit + *
  • Prefix-compressed Binary: values are written in chunks of 16, with the first value written + * completely and other values sharing prefixes. Chunk addresses are written in blocks of 16k, + * with the current absolute start for the block, and the average (expected) delta per entry. + * For each chunk the deviation from the delta (actual - expected) is written. * *

    * {@link DocValuesType#SORTED SORTED}: *

      - *
    • Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document - * ordinals written using one of the numeric strategies above. + *
    • Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, + * along with the per-document ordinals written using one of the numeric strategies above. *
    *

    * {@link DocValuesType#SORTED_SET SORTED_SET}: *

      - *
    • SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document - * ordinal list written using one of the binary strategies above. + *
    • SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary, + * an ordinal list and per-document index into this list are written using the numeric strategies + * above. *
    *

    * Files: @@ -85,25 +89,35 @@ import org.apache.lucene.util.packed.PackedInts; *

    The DocValues metadata or .dvm file.

    *

    For DocValues field, this stores metadata, such as the offset into the * DocValues data (.dvd)

    - *

    DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry>NumFields

    + *

    DocValues metadata (.dvm) --> Header,<Entry>NumFields

    *
      - *
    • Entry --> NumericEntry | BinaryEntry | SortedEntry
    • - *
    • NumericEntry --> DataOffset,NumericCompressionType,PackedVersion
    • - *
    • BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?
    • - *
    • SortedEntry --> DataOffset,ValueCount
    • + *
    • Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry
    • + *
    • NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry
    • + *
    • GCDNumericEntry --> NumericHeader,MinValue,GCD
    • + *
    • TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}TableSize
    • + *
    • DeltaNumericEntry --> NumericHeader
    • + *
    • NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize
    • + *
    • BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry
    • + *
    • FixedBinaryEntry --> BinaryHeader
    • + *
    • VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize
    • + *
    • PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize
    • + *
    • BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset
    • + *
    • SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry
    • + *
    • SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry
    • *
    • FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
    • - *
    • DataOffset,DataLength --> {@link DataOutput#writeLong Int64}
    • *
    • EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
    • *
    • Header --> {@link CodecUtil#writeHeader CodecHeader}
    • + *
    • MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}
    • + *
    • TableSize --> {@link DataOutput#writeVInt vInt}
    • *
    - *

    Sorted fields have two entries: a SortedEntry with the FST metadata, + *

    Sorted fields have two entries: a BinaryEntry with the value metadata, * and an ordinary NumericEntry for the document-to-ord metadata.

    - *

    SortedSet fields have two entries: a SortedEntry with the FST metadata, - * and an ordinary BinaryEntry for the document-to-ord-list metadata.

    + *

    SortedSet fields have three entries: a BinaryEntry with the value metadata, + * and two NumericEntries for the document-to-ord-index and ordinal list metadata.

    *

    FieldNumber of -1 indicates the end of metadata.

    - *

    EntryType is a 0 (NumericEntry), 1 (BinaryEntry, or 2 (SortedEntry)

    + *

    EntryType is 0 (NumericEntry) or 1 (BinaryEntry)

    *

    DataOffset is the pointer to the start of the data in the DocValues data (.dvd)

    - *

    NumericCompressionType indicates how Numeric values will be compressed: + *

    NumericType indicates how Numeric values will be compressed: *

      *
    • 0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded * from the minimum value within the block. @@ -112,10 +126,18 @@ import org.apache.lucene.util.packed.PackedInts; *
    • 2 --> table-compressed. When the number of unique numeric values is small and it would save space, * a lookup table of unique values is written, followed by the ordinal for each document. *
    + *

    BinaryType indicates how Binary values will be stored: + *

      + *
    • 0 --> fixed-width. All values have the same length, addressing by multiplication. + *
    • 1 --> variable-width. An address for each value is stored. + *
    • 2 --> prefix-compressed. An address to the start of every interval'th value is stored. + *
    *

    MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) * is written for the addresses. + *

    MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field. + * If it is -1, then there are no missing values. *

  • *

    The DocValues data or .dvd file.

    *

    For DocValues field, this stores the actual per-document data (the heavy-lifting)

    @@ -125,21 +147,18 @@ import org.apache.lucene.util.packed.PackedInts; *
  • BinaryData --> {@link DataOutput#writeByte Byte}DataLength,Addresses
  • *
  • SortedData --> {@link FST FST<Int64>}
  • *
  • DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}
  • - *
  • TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}TableSize,{@link PackedInts PackedInts}
  • - *
  • GCDCompressedNumerics --> MinValue,GCD,{@link BlockPackedWriter BlockPackedInts(blockSize=16k)}
  • + *
  • TableCompressedNumerics --> {@link PackedInts PackedInts}
  • + *
  • GCDCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}
  • *
  • Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}
  • - *
  • TableSize --> {@link DataOutput#writeVInt vInt}
  • - *
  • MinValue --> {@link DataOutput#writeLong Int64}
  • - *
  • GCD --> {@link DataOutput#writeLong Int64}
  • * *

    SortedSet entries store the list of ordinals in their BinaryData as a * sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.

    * * @lucene.experimental */ -// nocommit: docs are incomplete public final class Lucene45DocValuesFormat extends DocValuesFormat { + /** Sole Constructor */ public Lucene45DocValuesFormat() { super("Lucene45"); } @@ -154,14 +173,14 @@ public final class Lucene45DocValuesFormat extends DocValuesFormat { return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } - public static final String DATA_CODEC = "Lucene45DocValuesData"; - public static final String DATA_EXTENSION = "dvd"; - public static final String META_CODEC = "Lucene45ValuesMetadata"; - public static final String META_EXTENSION = "dvm"; - public static final int VERSION_START = 0; - public static final int VERSION_CURRENT = VERSION_START; - public static final byte NUMERIC = 0; - public static final byte BINARY = 1; - public static final byte SORTED = 2; - public static final byte SORTED_SET = 3; + static final String DATA_CODEC = "Lucene45DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String META_CODEC = "Lucene45ValuesMetadata"; + static final String META_EXTENSION = "dvm"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + static final byte NUMERIC = 0; + static final byte BINARY = 1; + static final byte SORTED = 2; + static final byte SORTED_SET = 3; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java index b1ca3a8cf60..b12fa6d75d9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java @@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; import static 
org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED; +import java.io.Closeable; // javadocs import java.io.IOException; import java.util.Comparator; import java.util.HashMap; @@ -53,7 +54,8 @@ import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; import org.apache.lucene.util.packed.PackedInts; -public class Lucene45DocValuesProducer extends DocValuesProducer { +/** reader for {@link Lucene45DocValuesFormat} */ +public class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable { private final Map numerics; private final Map binaries; private final Map ords; @@ -65,6 +67,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { private final Map addressInstances = new HashMap(); private final Map ordIndexInstances = new HashMap(); + /** expert: instantiates a new reader */ protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); // read in the entries from the metadata file. @@ -317,6 +320,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { }; } + /** returns an address instance for variable-length binary values. + * @lucene.internal */ protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { final MonotonicBlockPackedReader addresses; synchronized (addressInstances) { @@ -358,6 +363,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { }; } + /** returns an address instance for prefix-compressed binary values. 
+ * @lucene.internal */ protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { final MonotonicBlockPackedReader addresses; final long interval = bytes.addressInterval; @@ -434,6 +441,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { }; } + /** returns an address instance for sortedset ordinal lists + * @lucene.internal */ protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException { final MonotonicBlockPackedReader ordIndex; synchronized (ordIndexInstances) { @@ -509,7 +518,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { }; } - public Bits getMissingBits(final long offset) throws IOException { + private Bits getMissingBits(final long offset) throws IOException { if (offset == -1) { return new Bits.MatchAllBits(maxDoc); } else { @@ -557,13 +566,20 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { data.close(); } + /** metadata entry for a numeric docvalues field */ protected static class NumericEntry { + private NumericEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ long missingOffset; + /** offset to the actual numeric values */ public long offset; - public int format; + int format; + /** packed ints version used to encode these numerics */ public int packedIntsVersion; + /** count of values written */ public long count; + /** packed ints blocksize */ public int blockSize; long minValue; @@ -571,17 +587,26 @@ public class Lucene45DocValuesProducer extends DocValuesProducer { long table[]; } + /** metadata entry for a binary docvalues field */ protected static class BinaryEntry { + private BinaryEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ long missingOffset; + /** offset to the actual binary values */ long offset; int format; + 
/** count of values written */ public long count; int minLength; int maxLength; + /** offset to the addressing data that maps a value to its slice of the byte[] */ public long addressesOffset; + /** interval of shared prefix chunks (when using prefix-compressed binary) */ public long addressInterval; + /** packed ints version used to encode addressing information */ public int packedIntsVersion; + /** packed ints blocksize */ public int blockSize; } diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 3a3e301a9b2..f9f82317b62 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -96,8 +96,8 @@ class BinaryDocValuesWriter extends DocValuesWriter { } private long docsWithFieldBytesUsed() { - // nocommit: this is not correct - return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG; + // size of the long[] + some overhead + return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64; } private void updateBytesUsed() { diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index 7c5aa83fdae..08f065e1df2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -70,8 +70,8 @@ class NumericDocValuesWriter extends DocValuesWriter { } private long docsWithFieldBytesUsed() { - // nocommit: this is not correct - return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG; + // size of the long[] + some overhead + return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64; } private void updateBytesUsed() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java 
b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java index 5579af6245f..8b64401b452 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java @@ -28,7 +28,6 @@ import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -// nocommit /** * Just like {@link Lucene42NormsFormat} but with additional asserts. */ diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html new file mode 100644 index 00000000000..ca70ffc3b2e --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html @@ -0,0 +1,25 @@ + + + + + + + +Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc) + + diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 48e23848aea..c1902cd93eb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -2538,7 +2538,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { d.close(); } - // nocommit: get this out of here and into the deprecated codecs (4.0, 4.2) + // TODO: get this out of here and into the deprecated codecs (4.0, 4.2) public void testHugeBinaryValueLimit() throws Exception { // We only test DVFormats that have a limit assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field")); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java 
b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index d79c948ed7e..b1a43a0fa59 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -757,14 +757,13 @@ public class _TestUtil { } } - // nocommit: remove this, push this test to Lucene40/Lucene42 codec tests + // TODO: remove this, push this test to Lucene40/Lucene42 codec tests public static boolean fieldSupportsHugeBinaryDocValues(String field) { String dvFormat = getDocValuesFormat(field); - System.out.println(dvFormat); - return dvFormat.equals("Lucene45") || - dvFormat.equals("Asserting") || - dvFormat.equals("Disk") || - dvFormat.equals("SimpleText"); + if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) { + return false; + } + return true; } public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {