mirror of https://github.com/apache/lucene.git
LUCENE-6863: Optimized storage requirements of doc values fields when less than 1% of documents have a value.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712957 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4ef2d43d58
commit
7c917a5ed8
|
@ -186,6 +186,9 @@ Optimizations
|
|||
* LUCENE-6885: StandardDirectoryReader (initialCapacity) tweaks
|
||||
(Christine Poerschke)
|
||||
|
||||
* LUCENE-6863: Optimized storage requirements of doc values fields when less
|
||||
than 1% of documents have a value. (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display
|
||||
|
|
|
@ -28,6 +28,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
|
@ -54,6 +55,13 @@ import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*;
|
|||
/** writer for {@link Lucene54DocValuesFormat} */
|
||||
final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||
|
||||
/**
 * Tells the numeric writer how to recognise a document without a value.
 * For {@link #ORDINAL} a missing document is encoded as {@code -1}; for
 * {@link #VALUE} a missing document is a {@code null} entry.
 */
enum NumberType {

  /** Dense ordinals; {@code -1} marks a document that has no ordinal. */
  ORDINAL,

  /** Random long values; {@code null} marks a document that has no value. */
  VALUE
}
|
||||
|
||||
IndexOutput data, meta;
|
||||
final int maxDoc;
|
||||
|
||||
|
@ -78,10 +86,10 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
|
||||
@Override
|
||||
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
|
||||
addNumericField(field, values, true);
|
||||
addNumericField(field, values, NumberType.VALUE);
|
||||
}
|
||||
|
||||
void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
|
||||
void addNumericField(FieldInfo field, Iterable<Number> values, NumberType numberType) throws IOException {
|
||||
long count = 0;
|
||||
long minValue = Long.MAX_VALUE;
|
||||
long maxValue = Long.MIN_VALUE;
|
||||
|
@ -90,7 +98,8 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
long zeroCount = 0;
|
||||
// TODO: more efficient?
|
||||
HashSet<Long> uniqueValues = null;
|
||||
if (optimizeStorage) {
|
||||
long missingOrdCount = 0;
|
||||
if (numberType == NumberType.VALUE) {
|
||||
uniqueValues = new HashSet<>();
|
||||
|
||||
for (Number nv : values) {
|
||||
|
@ -133,6 +142,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
} else {
|
||||
for (Number nv : values) {
|
||||
long v = nv.longValue();
|
||||
if (v == -1L) {
|
||||
missingOrdCount++;
|
||||
}
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
++count;
|
||||
|
@ -145,6 +157,18 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
? Integer.MAX_VALUE
|
||||
: DirectWriter.bitsRequired(uniqueValues.size() - 1);
|
||||
|
||||
final boolean sparse; // 1% of docs or less have a value
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
sparse = (double) missingCount / count >= 0.99;
|
||||
break;
|
||||
case ORDINAL:
|
||||
sparse = (double) missingOrdCount / count >= 0.99;
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
|
||||
final int format;
|
||||
if (uniqueValues != null
|
||||
&& count <= Integer.MAX_VALUE
|
||||
|
@ -152,6 +176,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
|| (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) {
|
||||
// either one unique value C or two unique values: "missing" and C
|
||||
format = CONST_COMPRESSED;
|
||||
} else if (sparse && count >= 1024) {
|
||||
// require at least 1024 docs to avoid flipping back and forth when doing NRT search
|
||||
format = SPARSE_COMPRESSED;
|
||||
} else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) {
|
||||
format = TABLE_COMPRESSED;
|
||||
} else if (gcd != 0 && gcd != 1) {
|
||||
|
@ -164,7 +191,22 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
|
||||
meta.writeVInt(format);
|
||||
if (missingCount == 0) {
|
||||
if (format == SPARSE_COMPRESSED) {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
final long numDocsWithValue;
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
numDocsWithValue = count - missingCount;
|
||||
break;
|
||||
case ORDINAL:
|
||||
numDocsWithValue = count - missingOrdCount;
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue);
|
||||
assert maxDoc == count;
|
||||
} else if (missingCount == 0) {
|
||||
meta.writeLong(ALL_LIVE);
|
||||
} else if (missingCount == count) {
|
||||
meta.writeLong(ALL_MISSING);
|
||||
|
@ -220,6 +262,39 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
}
|
||||
ordsWriter.finish();
|
||||
break;
|
||||
case SPARSE_COMPRESSED:
|
||||
final Iterable<Number> filteredMissingValues;
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
meta.writeByte((byte) 0);
|
||||
filteredMissingValues = new Iterable<Number>() {
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
return StreamSupport
|
||||
.stream(values.spliterator(), false)
|
||||
.filter(value -> value != null)
|
||||
.iterator();
|
||||
}
|
||||
};
|
||||
break;
|
||||
case ORDINAL:
|
||||
meta.writeByte((byte) 1);
|
||||
filteredMissingValues = new Iterable<Number>() {
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
return StreamSupport
|
||||
.stream(values.spliterator(), false)
|
||||
.filter(value -> value.longValue() != -1L)
|
||||
.iterator();
|
||||
}
|
||||
};
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
// Write non-missing values as a numeric field
|
||||
addNumericField(field, filteredMissingValues, numberType);
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
|
@ -247,6 +322,34 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
}
|
||||
}
|
||||
|
||||
long writeSparseMissingBitset(Iterable<Number> values, NumberType numberType, long numDocsWithValue) throws IOException {
|
||||
meta.writeVLong(numDocsWithValue);
|
||||
|
||||
// Write doc IDs that have a value
|
||||
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
long docID = 0;
|
||||
for (Number nv : values) {
|
||||
switch (numberType) {
|
||||
case VALUE:
|
||||
if (nv != null) {
|
||||
docIdsWriter.add(docID);
|
||||
}
|
||||
break;
|
||||
case ORDINAL:
|
||||
if (nv.longValue() != -1L) {
|
||||
docIdsWriter.add(docID);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
docID++;
|
||||
}
|
||||
docIdsWriter.finish();
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
|
||||
// write the byte[] data
|
||||
|
@ -458,7 +561,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
meta.writeVInt(field.number);
|
||||
meta.writeByte(Lucene54DocValuesFormat.SORTED);
|
||||
addTermsDict(field, values);
|
||||
addNumericField(field, docToOrd, false);
|
||||
addNumericField(field, docToOrd, NumberType.ORDINAL);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -478,11 +581,11 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
writeDictionary(uniqueValueSets);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
// write the stream of values as a numeric field
|
||||
addNumericField(field, values, true);
|
||||
addNumericField(field, values, NumberType.VALUE);
|
||||
// write the doc -> ord count as an absolute index to the stream
|
||||
addOrdIndex(field, docToValueCount);
|
||||
}
|
||||
|
@ -510,7 +613,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
addTermsDict(field, values);
|
||||
|
||||
// write the doc -> set_id as a numeric field
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
|
||||
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL);
|
||||
} else {
|
||||
meta.writeVInt(SORTED_WITH_ADDRESSES);
|
||||
|
||||
|
@ -519,7 +622,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
|||
|
||||
// write the stream of ords as a numeric field
|
||||
// NOTE: we could return an iterator that delta-encodes these within a doc
|
||||
addNumericField(field, ords, false);
|
||||
addNumericField(field, ords, NumberType.ORDINAL);
|
||||
|
||||
// write the doc -> ord count as an absolute index to the stream
|
||||
addOrdIndex(field, docToOrdCount);
|
||||
|
|
|
@ -19,18 +19,14 @@ package org.apache.lucene.codecs.lucene54;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.packed.DirectWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
|
||||
/**
|
||||
* Lucene 5.4 DocValues format.
|
||||
|
@ -51,6 +47,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
* as blocks of bitpacked integers, encoding the deviation from the expected delta.
|
||||
* <li>Const-compressed: when there is only one possible non-missing value, only the missing
|
||||
* bitset is encoded.
|
||||
* <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
|
||||
* using binary search.
|
||||
* </ul>
|
||||
* <p>
|
||||
* {@link DocValuesType#BINARY BINARY}:
|
||||
|
@ -96,93 +94,6 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
* <p>DocValues metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry</li>
|
||||
* <li>NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
|
||||
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD,BitsPerValue</li>
|
||||
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,BitsPerValue</li>
|
||||
* <li>DeltaNumericEntry --> NumericHeader,MinValue,BitsPerValue</li>
|
||||
* <li>MonotonicNumericEntry --> NumericHeader,PackedVersion,BlockSize</li>
|
||||
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset</li>
|
||||
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
||||
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
||||
* <li>VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
|
||||
* <li>BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
|
||||
* <li>SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
|
||||
* <li>SortedSetEntry --> SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
|
||||
* <li>SingleSortedSetEntry --> SetHeader,SortedEntry</li>
|
||||
* <li>AddressesSortedSetEntry --> SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
|
||||
* <li>TableSortedSetEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
|
||||
* <li>SetHeader --> FieldNumber,EntryType,SetType</li>
|
||||
* <li>SortedNumericEntry --> SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
|
||||
* <li>SingleNumericEntry --> SetHeader,NumericEntry</li>
|
||||
* <li>AddressesSortedNumericEntry --> SetHeader,NumericEntry,NumericEntry</li>
|
||||
* <li>TableSortedNumericEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
|
||||
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
|
||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>TableSize,BitsPerValue,TotalTableLength --> {@link DataOutput#writeVInt vInt}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
||||
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
|
||||
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
|
||||
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
|
||||
* <p>EndOffset is the pointer to the end of the data in the DocValues data (.dvd)</p>
|
||||
* <p>NumericType indicates how Numeric values will be compressed:
|
||||
* <ul>
|
||||
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
||||
* from the minimum value within the block.
|
||||
* <li>1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
|
||||
* using blocks of delta-encoded ints.
|
||||
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||
* <li>3 --> monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC.
|
||||
* <li>4 --> const-compressed. Used when all non-missing values are the same.
|
||||
* </ul>
|
||||
* <p>BinaryType indicates how Binary values will be stored:
|
||||
* <ul>
|
||||
* <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
|
||||
* <li>1 --> variable-width. An address for each value is stored.
|
||||
* <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
|
||||
* </ul>
|
||||
* <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
|
||||
* <ul>
|
||||
* <li>0 --> with addresses. There are two numeric entries: a first one from document to start
|
||||
* offset, and a second one from offset to ord/value.
|
||||
* <li>1 --> single-valued. Used when all documents have at most one value and is encoded like
|
||||
* a regular Sorted/Numeric entry.
|
||||
* <li>2 --> table-encoded. A lookup table of unique sets of values is written, followed by a
|
||||
* numeric entry that maps each document to an ordinal in this table.
|
||||
* </ul>
|
||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
|
||||
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
|
||||
* is written for the addresses.
|
||||
* <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
|
||||
* If it's -1, then there are no missing values. If it's -2, all values are missing.
|
||||
* <li><a name="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
|
||||
* <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
|
||||
* <li>SortedData --> {@link FST FST<Int64>}</li>
|
||||
* <li>DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --> {@link DirectWriter PackedInts}</li>
|
||||
* <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </ol>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Lucene54DocValuesFormat extends DocValuesFormat {
|
||||
|
@ -207,8 +118,7 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
|
|||
static final String META_CODEC = "Lucene54DocValuesMetadata";
|
||||
static final String META_EXTENSION = "dvm";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_SORTEDSET_TABLE = 1;
|
||||
static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
// indicates docvalues type
|
||||
static final byte NUMERIC = 0;
|
||||
|
@ -242,7 +152,9 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
|
|||
static final int MONOTONIC_COMPRESSED = 3;
|
||||
/** Compressed with constant value (uses only missing bitset) */
|
||||
static final int CONST_COMPRESSED = 4;
|
||||
|
||||
/** Compressed with sparse arrays. */
|
||||
static final int SPARSE_COMPRESSED = 5;
|
||||
|
||||
/** Uncompressed binary, written directly (fixed length). */
|
||||
static final int BINARY_FIXED_UNCOMPRESSED = 0;
|
||||
/** Uncompressed binary, written directly (variable length). */
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicLong;
|
|||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesConsumer.NumberType;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
|
@ -314,6 +315,14 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
NumericEntry entry = new NumericEntry();
|
||||
entry.format = meta.readVInt();
|
||||
entry.missingOffset = meta.readLong();
|
||||
if (entry.format == SPARSE_COMPRESSED) {
|
||||
// sparse bits need a bit more metadata
|
||||
entry.numDocsWithValue = meta.readVLong();
|
||||
final int blockShift = meta.readVInt();
|
||||
entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue + 1, blockShift);
|
||||
ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
|
||||
directAddressesMeta.put(info.name, entry.monotonicMeta);
|
||||
}
|
||||
entry.offset = meta.readLong();
|
||||
entry.count = meta.readVLong();
|
||||
switch(entry.format) {
|
||||
|
@ -351,6 +360,30 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
|
||||
directAddressesMeta.put(info.name, entry.monotonicMeta);
|
||||
break;
|
||||
case SPARSE_COMPRESSED:
|
||||
final byte numberType = meta.readByte();
|
||||
switch (numberType) {
|
||||
case 0:
|
||||
entry.numberType = NumberType.VALUE;
|
||||
break;
|
||||
case 1:
|
||||
entry.numberType = NumberType.ORDINAL;
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta);
|
||||
}
|
||||
|
||||
// now read the numeric entry for non-missing values
|
||||
final int fieldNumber = meta.readVInt();
|
||||
if (fieldNumber != info.number) {
|
||||
throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta);
|
||||
}
|
||||
final int dvFormat = meta.readByte();
|
||||
if (dvFormat != NUMERIC) {
|
||||
throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + NUMERIC, meta);
|
||||
}
|
||||
entry.nonMissingValues = readNumericEntry(info, meta);
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta);
|
||||
}
|
||||
|
@ -493,11 +526,162 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
}
|
||||
};
|
||||
}
|
||||
case SPARSE_COMPRESSED:
|
||||
final SparseBits docsWithField = getSparseLiveBits(entry);
|
||||
final LongValues values = getNumeric(entry.nonMissingValues);
|
||||
final long missingValue;
|
||||
switch (entry.numberType) {
|
||||
case ORDINAL:
|
||||
missingValue = -1L;
|
||||
break;
|
||||
case VALUE:
|
||||
missingValue = 0L;
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
return new SparseLongValues(docsWithField, values, missingValue);
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
}
|
||||
|
||||
static class SparseBits implements Bits {
|
||||
|
||||
final long maxDoc, docIDsLength, firstDocId;
|
||||
final LongValues docIds;
|
||||
|
||||
long index; // index of docId in docIds
|
||||
long docId; // doc ID at index
|
||||
long nextDocId; // doc ID at (index+1)
|
||||
|
||||
SparseBits(long maxDoc, long docIDsLength, LongValues docIDs) {
|
||||
if (docIDsLength > 0 && maxDoc <= docIDs.get(docIDsLength - 1)) {
|
||||
throw new IllegalArgumentException("maxDoc must be > the last element of docIDs");
|
||||
}
|
||||
this.maxDoc = maxDoc;
|
||||
this.docIDsLength = docIDsLength;
|
||||
this.docIds = docIDs;
|
||||
this.firstDocId = docIDsLength == 0 ? maxDoc : docIDs.get(0);
|
||||
reset();
|
||||
}
|
||||
|
||||
private void reset() {
|
||||
index = -1;
|
||||
this.docId = -1;
|
||||
this.nextDocId = firstDocId;
|
||||
}
|
||||
|
||||
/** Gallop forward and stop as soon as an index is found that is greater than
|
||||
* the given docId. {@code index} will store an index that stores a value
|
||||
* that is <= {@code docId} while the return value will give an index
|
||||
* that stores a value that is > {@code docId}. These indices can then be
|
||||
* used to binary search. */
|
||||
private long gallop(long docId) {
|
||||
index++;
|
||||
this.docId = nextDocId;
|
||||
long hiIndex = index + 1;
|
||||
|
||||
while (true) {
|
||||
if (hiIndex >= docIDsLength) {
|
||||
hiIndex = docIDsLength;
|
||||
nextDocId = maxDoc;
|
||||
break;
|
||||
}
|
||||
|
||||
final long hiDocId = docIds.get(hiIndex);
|
||||
if (hiDocId > docId) {
|
||||
nextDocId = hiDocId;
|
||||
break;
|
||||
}
|
||||
|
||||
final long delta = hiIndex - index;
|
||||
index = hiIndex;
|
||||
this.docId = hiDocId;
|
||||
hiIndex += delta << 1; // double the step each time
|
||||
}
|
||||
return hiIndex;
|
||||
}
|
||||
|
||||
private void binarySearch(long hiIndex, long docId) {
|
||||
while (index + 1 < hiIndex) {
|
||||
final long midIndex = (index + hiIndex) >>> 1;
|
||||
final long midDocId = docIds.get(midIndex);
|
||||
if (midDocId > docId) {
|
||||
hiIndex = midIndex;
|
||||
nextDocId = midDocId;
|
||||
} else {
|
||||
index = midIndex;
|
||||
this.docId = midDocId;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean checkInvariants(long nextIndex, long docId) {
|
||||
assert this.docId <= docId;
|
||||
assert this.nextDocId > docId;
|
||||
assert (index == -1 && this.docId == -1) || this.docId == docIds.get(index);
|
||||
assert (nextIndex == docIDsLength && nextDocId == maxDoc) || nextDocId == docIds.get(nextIndex);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void exponentialSearch(long docId) {
|
||||
// seek forward by doubling the interval on each iteration
|
||||
final long hiIndex = gallop(docId);
|
||||
assert checkInvariants(hiIndex, docId);
|
||||
|
||||
// now perform the actual binary search
|
||||
binarySearch(hiIndex, docId);
|
||||
}
|
||||
|
||||
boolean get(final long docId) {
|
||||
if (docId < this.docId) {
|
||||
// reading doc IDs backward, go back to the start
|
||||
reset();
|
||||
}
|
||||
|
||||
if (docId >= nextDocId) {
|
||||
exponentialSearch(docId);
|
||||
}
|
||||
|
||||
assert checkInvariants(index + 1, docId);
|
||||
return docId == this.docId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
return get((long) index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return Math.toIntExact(maxDoc);
|
||||
}
|
||||
}
|
||||
|
||||
static class SparseLongValues extends LongValues {
|
||||
|
||||
final SparseBits docsWithField;
|
||||
final LongValues values;
|
||||
final long missingValue;
|
||||
|
||||
SparseLongValues(SparseBits docsWithField, LongValues values, long missingValue) {
|
||||
this.docsWithField = docsWithField;
|
||||
this.values = values;
|
||||
this.missingValue = missingValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long docId) {
|
||||
if (docsWithField.get(docId)) {
|
||||
return values.get(docsWithField.index);
|
||||
} else {
|
||||
return missingValue;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
BinaryEntry bytes = binaries.get(field.name);
|
||||
|
@ -658,7 +842,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
if (ss.format == SORTED_SINGLE_VALUED) {
|
||||
NumericEntry numericEntry = numerics.get(field.name);
|
||||
final LongValues values = getNumeric(numericEntry);
|
||||
final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
|
||||
final Bits docsWithField;
|
||||
if (numericEntry.format == SPARSE_COMPRESSED) {
|
||||
docsWithField = ((SparseLongValues) values).docsWithField;
|
||||
} else {
|
||||
docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
|
||||
}
|
||||
return DocValues.singleton(values, docsWithField);
|
||||
} else if (ss.format == SORTED_WITH_ADDRESSES) {
|
||||
NumericEntry numericEntry = numerics.get(field.name);
|
||||
|
@ -898,6 +1087,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
}
|
||||
}
|
||||
|
||||
private SparseBits getSparseLiveBits(NumericEntry entry) throws IOException {
|
||||
final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset);
|
||||
final LongValues docIDs = DirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData);
|
||||
return new SparseBits(maxDoc, entry.numDocsWithValue, docIDs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Bits getDocsWithField(FieldInfo field) throws IOException {
|
||||
switch(field.getDocValuesType()) {
|
||||
|
@ -912,7 +1107,11 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
return getLiveBits(be.missingOffset, maxDoc);
|
||||
case NUMERIC:
|
||||
NumericEntry ne = numerics.get(field.name);
|
||||
return getLiveBits(ne.missingOffset, maxDoc);
|
||||
if (ne.format == SPARSE_COMPRESSED) {
|
||||
return getSparseLiveBits(ne);
|
||||
} else {
|
||||
return getLiveBits(ne.missingOffset, maxDoc);
|
||||
}
|
||||
default:
|
||||
throw new AssertionError();
|
||||
}
|
||||
|
@ -950,6 +1149,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
|
|||
long minValue;
|
||||
long gcd;
|
||||
long table[];
|
||||
|
||||
/** for sparse compression */
|
||||
long numDocsWithValue;
|
||||
NumericEntry nonMissingValues;
|
||||
NumberType numberType;
|
||||
|
||||
}
|
||||
|
||||
/** metadata entry for a binary docvalues field */
|
||||
|
|
|
@ -18,32 +18,52 @@ package org.apache.lucene.codecs.lucene54;
|
|||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.asserting.AssertingCodec;
|
||||
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseBits;
|
||||
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseLongValues;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SerialMergeScheduler;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
/**
|
||||
|
@ -115,7 +135,141 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Slow
|
||||
public void testSparseDocValuesVsStoredFields() throws Exception {
|
||||
int numIterations = atLeast(2);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
doTestSparseDocValuesVsStoredFields();
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestSparseDocValuesVsStoredFields() throws Exception {
|
||||
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
|
||||
for (int i = 0; i < values.length; ++i) {
|
||||
values[i] = random().nextLong();
|
||||
}
|
||||
|
||||
Directory dir = newFSDirectory(createTempDir());
|
||||
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||
conf.setMergeScheduler(new SerialMergeScheduler());
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
|
||||
|
||||
// sparse compression is only enabled if less than 1% of docs have a value
|
||||
final int avgGap = 100;
|
||||
|
||||
final int numDocs = atLeast(100);
|
||||
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
|
||||
writer.addDocument(new Document());
|
||||
}
|
||||
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
Document doc = new Document();
|
||||
|
||||
// single-valued
|
||||
long docValue = values[random().nextInt(values.length)];
|
||||
doc.add(new NumericDocValuesField("numeric", docValue));
|
||||
doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new StoredField("value", docValue));
|
||||
|
||||
// multi-valued
|
||||
final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
|
||||
for (int j = 0; j < numValues; ++j) {
|
||||
docValue = values[random().nextInt(values.length)];
|
||||
doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
|
||||
doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
|
||||
doc.add(new StoredField("values", docValue));
|
||||
}
|
||||
|
||||
writer.addDocument(doc);
|
||||
|
||||
// add a gap
|
||||
for (int j = random().nextInt(avgGap * 2); j >= 0; --j) {
|
||||
writer.addDocument(new Document());
|
||||
}
|
||||
}
|
||||
|
||||
if (random().nextBoolean()) {
|
||||
writer.forceMerge(1);
|
||||
}
|
||||
|
||||
final IndexReader indexReader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
for (LeafReaderContext context : indexReader.leaves()) {
|
||||
final LeafReader reader = context.reader();
|
||||
final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
|
||||
final Bits numericBits = DocValues.getDocsWithField(reader, "numeric");
|
||||
|
||||
final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
|
||||
final Bits sortedBits = DocValues.getDocsWithField(reader, "sorted");
|
||||
|
||||
final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
|
||||
final Bits binaryBits = DocValues.getDocsWithField(reader, "binary");
|
||||
|
||||
final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
|
||||
final Bits sortedNumericBits = DocValues.getDocsWithField(reader, "sorted_numeric");
|
||||
|
||||
final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
|
||||
final Bits sortedSetBits = DocValues.getDocsWithField(reader, "sorted_set");
|
||||
|
||||
for (int i = 0; i < reader.maxDoc(); ++i) {
|
||||
final StoredDocument doc = reader.document(i);
|
||||
final StorableField valueField = doc.getField("value");
|
||||
final Long value = valueField == null ? null : valueField.numericValue().longValue();
|
||||
|
||||
if (value == null) {
|
||||
assertEquals(0, numeric.get(i));
|
||||
assertEquals(-1, sorted.getOrd(i));
|
||||
assertEquals(new BytesRef(), binary.get(i));
|
||||
|
||||
assertFalse(numericBits.get(i));
|
||||
assertFalse(sortedBits.get(i));
|
||||
assertFalse(binaryBits.get(i));
|
||||
} else {
|
||||
assertEquals(value.longValue(), numeric.get(i));
|
||||
assertTrue(sorted.getOrd(i) >= 0);
|
||||
assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.getOrd(i)));
|
||||
assertEquals(new BytesRef(Long.toString(value)), binary.get(i));
|
||||
|
||||
assertTrue(numericBits.get(i));
|
||||
assertTrue(sortedBits.get(i));
|
||||
assertTrue(binaryBits.get(i));
|
||||
}
|
||||
|
||||
final StorableField[] valuesFields = doc.getFields("values");
|
||||
final Set<Long> valueSet = new HashSet<>();
|
||||
for (StorableField sf : valuesFields) {
|
||||
valueSet.add(sf.numericValue().longValue());
|
||||
}
|
||||
|
||||
sortedNumeric.setDocument(i);
|
||||
assertEquals(valuesFields.length, sortedNumeric.count());
|
||||
for (int j = 0; j < sortedNumeric.count(); ++j) {
|
||||
assertTrue(valueSet.contains(sortedNumeric.valueAt(j)));
|
||||
}
|
||||
sortedSet.setDocument(i);
|
||||
int sortedSetCount = 0;
|
||||
while (true) {
|
||||
long ord = sortedSet.nextOrd();
|
||||
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
|
||||
break;
|
||||
}
|
||||
assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
|
||||
sortedSetCount++;
|
||||
}
|
||||
assertEquals(valueSet.size(), sortedSetCount);
|
||||
|
||||
assertEquals(!valueSet.isEmpty(), sortedNumericBits.get(i));
|
||||
assertEquals(!valueSet.isEmpty(), sortedSetBits.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
indexReader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// TODO: try to refactor this and some termsenum tests into the base class.
|
||||
// to do this we need to fix the test class to get a DVF not a Codec so we can set up
|
||||
// the postings format correctly.
|
||||
|
@ -278,4 +432,74 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSparseLongValues() {
|
||||
final int iters = atLeast(5);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int numDocs = TestUtil.nextInt(random(), 0, 100);
|
||||
final long[] docIds = new long[numDocs];
|
||||
final long[] values = new long[numDocs];
|
||||
final long maxDoc;
|
||||
if (numDocs == 0) {
|
||||
maxDoc = 1 + random().nextInt(10);
|
||||
} else {
|
||||
docIds[0] = random().nextInt(10);
|
||||
for (int i = 1; i < docIds.length; ++i) {
|
||||
docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
|
||||
}
|
||||
maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
|
||||
}
|
||||
for (int i = 0; i < values.length; ++i) {
|
||||
values[i] = random().nextLong();
|
||||
}
|
||||
final long missingValue = random().nextLong();
|
||||
final LongValues docIdsValues = new LongValues() {
|
||||
@Override
|
||||
public long get(long index) {
|
||||
return docIds[Math.toIntExact(index)];
|
||||
}
|
||||
};
|
||||
final LongValues valuesValues = new LongValues() {
|
||||
@Override
|
||||
public long get(long index) {
|
||||
return values[Math.toIntExact(index)];
|
||||
}
|
||||
};
|
||||
final SparseBits liveBits = new SparseBits(maxDoc, numDocs, docIdsValues);
|
||||
// random-access
|
||||
for (int i = 0; i < 2000; ++i) {
|
||||
final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
|
||||
final boolean exists = liveBits.get(Math.toIntExact(docId));
|
||||
assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
|
||||
}
|
||||
// sequential access
|
||||
for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
|
||||
final boolean exists = liveBits.get(Math.toIntExact(docId));
|
||||
assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
|
||||
}
|
||||
|
||||
final SparseLongValues sparseValues = new SparseLongValues(liveBits, valuesValues, missingValue);
|
||||
// random-access
|
||||
for (int i = 0; i < 2000; ++i) {
|
||||
final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
|
||||
final int idx = Arrays.binarySearch(docIds, docId);
|
||||
final long value = sparseValues.get(docId);
|
||||
if (idx >= 0) {
|
||||
assertEquals(values[idx], value);
|
||||
} else {
|
||||
assertEquals(missingValue, value);
|
||||
}
|
||||
}
|
||||
// sequential access
|
||||
for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
|
||||
final int idx = Arrays.binarySearch(docIds, docId);
|
||||
final long value = sparseValues.get(docId);
|
||||
if (idx >= 0) {
|
||||
assertEquals(values[idx], value);
|
||||
} else {
|
||||
assertEquals(missingValue, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue