mirror of https://github.com/apache/lucene.git
javadocs/cleanups
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515563 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e925901014
commit
c2f37e0de2
|
@ -75,6 +75,7 @@ public abstract class DocValuesProducer implements Closeable {
|
|||
final SortedDocValues in;
|
||||
final int maxDoc;
|
||||
|
||||
/** Creates a {@link Bits} returning true if the document has a value */
|
||||
public SortedDocsWithField(SortedDocValues in, int maxDoc) {
|
||||
this.in = in;
|
||||
this.maxDoc = maxDoc;
|
||||
|
@ -102,6 +103,7 @@ public abstract class DocValuesProducer implements Closeable {
|
|||
final SortedSetDocValues in;
|
||||
final int maxDoc;
|
||||
|
||||
/** Creates a {@link Bits} returning true if the document has a value */
|
||||
public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
|
||||
this.in = in;
|
||||
this.maxDoc = maxDoc;
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene45;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Closeable; // javadocs
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -37,7 +38,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** writer for {@link Lucene45DocValuesFormat} */
|
||||
public class Lucene45DocValuesConsumer extends DocValuesConsumer {
|
||||
public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
static final int BLOCK_SIZE = 16384;
|
||||
static final int ADDRESS_INTERVAL = 16;
|
||||
|
@ -59,6 +60,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
|
|||
final IndexOutput data, meta;
|
||||
final int maxDoc;
|
||||
|
||||
/** expert: Creates a new writer */
|
||||
public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
|
@ -273,6 +275,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer {
|
|||
}
|
||||
}
|
||||
|
||||
/** expert: writes a value dictionary for a sorted/sortedset field */
|
||||
protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
||||
// first check if it's a "fixed-length" terms dict
|
||||
int minLength = Integer.MAX_VALUE;
|
||||
|
|
|
@ -60,19 +60,23 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* for each document. The addresses are written in blocks of 16k, with the current absolute
|
||||
* start for the block, and the average (expected) delta per entry. For each document the
|
||||
* deviation from the delta (actual - expected) is written.
|
||||
* <li>Prefix-compressed Binary: nocommit
|
||||
* <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
|
||||
* completely and other values sharing prefixes. chunk addresses are written in blocks of 16k,
|
||||
* with the current absolute start for the block, and the average (expected) delta per entry.
|
||||
* For each chunk the deviation from the delta (actual - expected) is written.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED SORTED}:
|
||||
* <ul>
|
||||
* <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
|
||||
* ordinals written using one of the numeric strategies above.
|
||||
* <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* along with the per-document ordinals written using one of the numeric strategies above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#SORTED_SET SORTED_SET}:
|
||||
* <ul>
|
||||
* <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
|
||||
* ordinal list written using one of the binary strategies above.
|
||||
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
|
||||
* along with an ordinal list and a per-document index into this list, written using the numeric strategies
|
||||
* above.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Files:
|
||||
|
@ -85,25 +89,35 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry><sup>NumFields</sup></p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<Entry><sup>NumFields</sup></p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry</li>
|
||||
* <li>NumericEntry --> DataOffset,NumericCompressionType,PackedVersion</li>
|
||||
* <li>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
|
||||
* <li>SortedEntry --> DataOffset,ValueCount</li>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry</li>
|
||||
* <li>NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
|
||||
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD</li>
|
||||
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
|
||||
* <li>DeltaNumericEntry --> NumericHeader</li>
|
||||
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
|
||||
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
||||
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
||||
* <li>VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
|
||||
* <li>SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
|
||||
* <li>SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>DataOffset,DataLength --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
|
||||
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>SortedSet fields have two entries: a SortedEntry with the FST metadata,
|
||||
* and an ordinary BinaryEntry for the document-to-ord-list metadata.</p>
|
||||
* <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
|
||||
* and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
* <p>EntryType is a 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)</p>
|
||||
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>NumericCompressionType indicates how Numeric values will be compressed:
|
||||
* <p>NumericType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
|
@ -112,10 +126,18 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* </ul>
|
||||
* <p>BinaryType indicates how Binary values will be stored:
|
||||
* <ul>
|
||||
* <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
|
||||
* <li>1 --> variable-width. An address for each value is stored.
|
||||
* <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
|
||||
* </ul>
|
||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
|
||||
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
|
||||
* is written for the addresses.
|
||||
* <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
|
||||
* If it's -1, then there are no missing values.
|
||||
* <li><a name="dvd" id="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
|
@ -125,21 +147,18 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
|
||||
* <li>SortedData --> {@link FST FST<Int64>}</li>
|
||||
* <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
|
||||
* <li>GCDCompressedNumerics --> MinValue,GCD,{@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>TableCompressedNumerics --> {@link PackedInts PackedInts}</li>
|
||||
* <li>GCDCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
|
||||
* <li>MinValue --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>GCD --> {@link DataOutput#writeLong Int64}</li>
|
||||
* </ul>
|
||||
* <p>SortedSet entries store the list of ordinals in their BinaryData as a
|
||||
* sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
|
||||
* </ol>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
// nocommit: docs are incomplete
|
||||
public final class Lucene45DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Sole Constructor */
|
||||
public Lucene45DocValuesFormat() {
|
||||
super("Lucene45");
|
||||
}
|
||||
|
@ -154,14 +173,14 @@ public final class Lucene45DocValuesFormat extends DocValuesFormat {
|
|||
return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
public static final String DATA_CODEC = "Lucene45DocValuesData";
|
||||
public static final String DATA_EXTENSION = "dvd";
|
||||
public static final String META_CODEC = "Lucene45ValuesMetadata";
|
||||
public static final String META_EXTENSION = "dvm";
|
||||
public static final int VERSION_START = 0;
|
||||
public static final int VERSION_CURRENT = VERSION_START;
|
||||
public static final byte NUMERIC = 0;
|
||||
public static final byte BINARY = 1;
|
||||
public static final byte SORTED = 2;
|
||||
public static final byte SORTED_SET = 3;
|
||||
static final String DATA_CODEC = "Lucene45DocValuesData";
|
||||
static final String DATA_EXTENSION = "dvd";
|
||||
static final String META_CODEC = "Lucene45ValuesMetadata";
|
||||
static final String META_EXTENSION = "dvm";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
static final byte NUMERIC = 0;
|
||||
static final byte BINARY = 1;
|
||||
static final byte SORTED = 2;
|
||||
static final byte SORTED_SET = 3;
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY
|
|||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
|
||||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED;
|
||||
|
||||
import java.io.Closeable; // javadocs
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
@ -53,7 +54,8 @@ import org.apache.lucene.util.packed.BlockPackedReader;
|
|||
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
public class Lucene45DocValuesProducer extends DocValuesProducer {
|
||||
/** reader for {@link Lucene45DocValuesFormat} */
|
||||
public class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
|
||||
private final Map<Integer,NumericEntry> numerics;
|
||||
private final Map<Integer,BinaryEntry> binaries;
|
||||
private final Map<Integer,NumericEntry> ords;
|
||||
|
@ -65,6 +67,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
|
||||
private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
|
||||
|
||||
/** expert: instantiates a new reader */
|
||||
protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
// read in the entries from the metadata file.
|
||||
|
@ -317,6 +320,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
};
|
||||
}
|
||||
|
||||
/** returns an address instance for variable-length binary values.
|
||||
* @lucene.internal */
|
||||
protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
|
||||
final MonotonicBlockPackedReader addresses;
|
||||
synchronized (addressInstances) {
|
||||
|
@ -358,6 +363,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
};
|
||||
}
|
||||
|
||||
/** returns an address instance for prefix-compressed binary values.
|
||||
* @lucene.internal */
|
||||
protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
|
||||
final MonotonicBlockPackedReader addresses;
|
||||
final long interval = bytes.addressInterval;
|
||||
|
@ -434,6 +441,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
};
|
||||
}
|
||||
|
||||
/** returns an address instance for sortedset ordinal lists
|
||||
* @lucene.internal */
|
||||
protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
|
||||
final MonotonicBlockPackedReader ordIndex;
|
||||
synchronized (ordIndexInstances) {
|
||||
|
@ -509,7 +518,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
};
|
||||
}
|
||||
|
||||
public Bits getMissingBits(final long offset) throws IOException {
|
||||
private Bits getMissingBits(final long offset) throws IOException {
|
||||
if (offset == -1) {
|
||||
return new Bits.MatchAllBits(maxDoc);
|
||||
} else {
|
||||
|
@ -557,13 +566,20 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
data.close();
|
||||
}
|
||||
|
||||
/** metadata entry for a numeric docvalues field */
|
||||
protected static class NumericEntry {
|
||||
private NumericEntry() {}
|
||||
/** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
|
||||
long missingOffset;
|
||||
/** offset to the actual numeric values */
|
||||
public long offset;
|
||||
|
||||
public int format;
|
||||
int format;
|
||||
/** packed ints version used to encode these numerics */
|
||||
public int packedIntsVersion;
|
||||
/** count of values written */
|
||||
public long count;
|
||||
/** packed ints blocksize */
|
||||
public int blockSize;
|
||||
|
||||
long minValue;
|
||||
|
@ -571,17 +587,26 @@ public class Lucene45DocValuesProducer extends DocValuesProducer {
|
|||
long table[];
|
||||
}
|
||||
|
||||
/** metadata entry for a binary docvalues field */
|
||||
protected static class BinaryEntry {
|
||||
private BinaryEntry() {}
|
||||
/** offset to the bitset representing docsWithField, or -1 if no documents have missing values */
|
||||
long missingOffset;
|
||||
/** offset to the actual binary values */
|
||||
long offset;
|
||||
|
||||
int format;
|
||||
/** count of values written */
|
||||
public long count;
|
||||
int minLength;
|
||||
int maxLength;
|
||||
/** offset to the addressing data that maps a value to its slice of the byte[] */
|
||||
public long addressesOffset;
|
||||
/** interval of shared prefix chunks (when using prefix-compressed binary) */
|
||||
public long addressInterval;
|
||||
/** packed ints version used to encode addressing information */
|
||||
public int packedIntsVersion;
|
||||
/** packed ints blocksize */
|
||||
public int blockSize;
|
||||
}
|
||||
|
||||
|
|
|
@ -96,8 +96,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
}
|
||||
|
||||
private long docsWithFieldBytesUsed() {
|
||||
// nocommit: this is not correct
|
||||
return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
|
||||
// size of the long[] + some overhead
|
||||
return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
|
||||
}
|
||||
|
||||
private void updateBytesUsed() {
|
||||
|
|
|
@ -70,8 +70,8 @@ class NumericDocValuesWriter extends DocValuesWriter {
|
|||
}
|
||||
|
||||
private long docsWithFieldBytesUsed() {
|
||||
// nocommit: this is not correct
|
||||
return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG;
|
||||
// size of the long[] + some overhead
|
||||
return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64;
|
||||
}
|
||||
|
||||
private void updateBytesUsed() {
|
||||
|
|
|
@ -28,7 +28,6 @@ import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
// nocommit
|
||||
/**
|
||||
* Just like {@link Lucene42NormsFormat} but with additional asserts.
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc)
|
||||
</body>
|
||||
</html>
|
|
@ -2538,7 +2538,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
|
|||
d.close();
|
||||
}
|
||||
|
||||
// nocommit: get this out of here and into the deprecated codecs (4.0, 4.2)
|
||||
// TODO: get this out of here and into the deprecated codecs (4.0, 4.2)
|
||||
public void testHugeBinaryValueLimit() throws Exception {
|
||||
// We only test DVFormats that have a limit
|
||||
assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
|
||||
|
|
|
@ -757,14 +757,13 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
|
||||
// nocommit: remove this, push this test to Lucene40/Lucene42 codec tests
|
||||
// TODO: remove this, push this test to Lucene40/Lucene42 codec tests
|
||||
public static boolean fieldSupportsHugeBinaryDocValues(String field) {
|
||||
String dvFormat = getDocValuesFormat(field);
|
||||
System.out.println(dvFormat);
|
||||
return dvFormat.equals("Lucene45") ||
|
||||
dvFormat.equals("Asserting") ||
|
||||
dvFormat.equals("Disk") ||
|
||||
dvFormat.equals("SimpleText");
|
||||
if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {
|
||||
|
|
Loading…
Reference in New Issue