LUCENE-6863: Optimized storage requirements of doc values fields when less than 1% of documents have a value.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712957 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2015-11-06 13:04:36 +00:00
parent 4ef2d43d58
commit 7c917a5ed8
5 changed files with 553 additions and 106 deletions

lucene/CHANGES.txt

@ -186,6 +186,9 @@ Optimizations
* LUCENE-6885: StandardDirectoryReader (initialCapacity) tweaks
(Christine Poerschke)
* LUCENE-6863: Optimized storage requirements of doc values fields when less
than 1% of documents have a value. (Adrien Grand)
Bug Fixes
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display

lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.stream.StreamSupport;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
@ -54,6 +55,13 @@ import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*;
/** writer for {@link Lucene54DocValuesFormat} */
final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable {
enum NumberType {
/** Dense ordinals */
ORDINAL,
/** Random long values */
VALUE;
}
IndexOutput data, meta;
final int maxDoc;
@ -78,10 +86,10 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
addNumericField(field, values, true);
addNumericField(field, values, NumberType.VALUE);
}
void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
void addNumericField(FieldInfo field, Iterable<Number> values, NumberType numberType) throws IOException {
long count = 0;
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
@ -90,7 +98,8 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
long zeroCount = 0;
// TODO: more efficient?
HashSet<Long> uniqueValues = null;
if (optimizeStorage) {
long missingOrdCount = 0;
if (numberType == NumberType.VALUE) {
uniqueValues = new HashSet<>();
for (Number nv : values) {
@ -133,6 +142,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
} else {
for (Number nv : values) {
long v = nv.longValue();
if (v == -1L) {
missingOrdCount++;
}
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
++count;
@ -145,6 +157,18 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
? Integer.MAX_VALUE
: DirectWriter.bitsRequired(uniqueValues.size() - 1);
final boolean sparse; // 1% of docs or less have a value
switch (numberType) {
case VALUE:
sparse = (double) missingCount / count >= 0.99;
break;
case ORDINAL:
sparse = (double) missingOrdCount / count >= 0.99;
break;
default:
throw new AssertionError();
}
final int format;
if (uniqueValues != null
&& count <= Integer.MAX_VALUE
@ -152,6 +176,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|| (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) {
// either one unique value C or two unique values: "missing" and C
format = CONST_COMPRESSED;
} else if (sparse && count >= 1024) {
// require at least 1024 docs to avoid flipping back and forth when doing NRT search
format = SPARSE_COMPRESSED;
} else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) {
format = TABLE_COMPRESSED;
} else if (gcd != 0 && gcd != 1) {
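For reference, the selection heuristic above can be distilled into a standalone predicate (the helper name is illustrative, not part of the patch): a field is encoded as SPARSE_COMPRESSED only when at least 99% of its documents are missing a value and the segment holds at least 1024 documents, the latter keeping the decision stable across small NRT segments.

// Hypothetical helper mirroring the format-selection logic above; not in the patch.
static boolean useSparseEncoding(long missingCount, long count) {
  // e.g. count = 2048, missingCount = 2033: 2033.0 / 2048 ≈ 0.9927 >= 0.99 -> sparse
  return count >= 1024 && (double) missingCount / count >= 0.99;
}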
@ -164,7 +191,22 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
meta.writeVInt(field.number);
meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
meta.writeVInt(format);
if (missingCount == 0) {
if (format == SPARSE_COMPRESSED) {
meta.writeLong(data.getFilePointer());
final long numDocsWithValue;
switch (numberType) {
case VALUE:
numDocsWithValue = count - missingCount;
break;
case ORDINAL:
numDocsWithValue = count - missingOrdCount;
break;
default:
throw new AssertionError();
}
final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue);
assert maxDoc == count;
} else if (missingCount == 0) {
meta.writeLong(ALL_LIVE);
} else if (missingCount == count) {
meta.writeLong(ALL_MISSING);
@ -220,6 +262,39 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
}
ordsWriter.finish();
break;
case SPARSE_COMPRESSED:
final Iterable<Number> filteredMissingValues;
switch (numberType) {
case VALUE:
meta.writeByte((byte) 0);
filteredMissingValues = new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
return StreamSupport
.stream(values.spliterator(), false)
.filter(value -> value != null)
.iterator();
}
};
break;
case ORDINAL:
meta.writeByte((byte) 1);
filteredMissingValues = new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
return StreamSupport
.stream(values.spliterator(), false)
.filter(value -> value.longValue() != -1L)
.iterator();
}
};
break;
default:
throw new AssertionError();
}
// Write non-missing values as a numeric field
addNumericField(field, filteredMissingValues, numberType);
break;
default:
throw new AssertionError();
}
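The filtering above deliberately builds a re-iterable view rather than a one-shot Iterator: every call to iterator() re-runs the stream filter, which matters because addNumericField consumes the values more than once (once to gather statistics, once to encode). A self-contained sketch of the same idiom, with made-up sample values and a hypothetical class name:

import java.util.Arrays;
import java.util.stream.StreamSupport;

class FilterSketch { // illustration only, not part of the patch
  public static void main(String[] args) {
    Iterable<Number> values = Arrays.<Number>asList(3L, null, 7L, null, 12L);
    // Each call to iterator() re-filters, so the view can be consumed repeatedly.
    Iterable<Number> dense = () -> StreamSupport
        .stream(values.spliterator(), false)
        .filter(v -> v != null)
        .iterator();
    for (Number n : dense) System.out.println(n); // prints 3, 7, 12
    for (Number n : dense) System.out.println(n); // a second pass works too
  }
}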
@ -247,6 +322,34 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
}
}
long writeSparseMissingBitset(Iterable<Number> values, NumberType numberType, long numDocsWithValue) throws IOException {
meta.writeVLong(numDocsWithValue);
// Write doc IDs that have a value
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, DIRECT_MONOTONIC_BLOCK_SHIFT);
long docID = 0;
for (Number nv : values) {
switch (numberType) {
case VALUE:
if (nv != null) {
docIdsWriter.add(docID);
}
break;
case ORDINAL:
if (nv.longValue() != -1L) {
docIdsWriter.add(docID);
}
break;
default:
throw new AssertionError();
}
docID++;
}
docIdsWriter.finish();
return docID;
}
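A DirectMonotonicWriter suits this data well: the recorded doc IDs are strictly increasing, so each block can be modeled as a base value plus an average slope, leaving only small deviations to bit-pack. A rough illustration of that principle with plain arrays (a sketch of the idea, not the actual packed format):

long[] docIds = {3, 101, 205, 298, 402};           // docs that have a value, in order
long base = docIds[0];
double avgGap = (double) (docIds[4] - base) / 4;   // ≈ 99.75
for (int i = 0; i < docIds.length; i++) {
  long expected = base + Math.round(i * avgGap);   // linear model of the sequence
  System.out.println(docIds[i] - expected);        // deviations are small -> few bits each
}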
@Override
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
// write the byte[] data
@ -458,7 +561,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
meta.writeVInt(field.number);
meta.writeByte(Lucene54DocValuesFormat.SORTED);
addTermsDict(field, values);
addNumericField(field, docToOrd, false);
addNumericField(field, docToOrd, NumberType.ORDINAL);
}
@Override
@ -478,11 +581,11 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
writeDictionary(uniqueValueSets);
// write the doc -> set_id as a numeric field
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL);
} else {
meta.writeVInt(SORTED_WITH_ADDRESSES);
// write the stream of values as a numeric field
addNumericField(field, values, true);
addNumericField(field, values, NumberType.VALUE);
// write the doc -> ord count as an absolute index to the stream
addOrdIndex(field, docToValueCount);
}
@ -510,7 +613,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
addTermsDict(field, values);
// write the doc -> set_id as a numeric field
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL);
} else {
meta.writeVInt(SORTED_WITH_ADDRESSES);
@ -519,7 +622,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
// write the stream of ords as a numeric field
// NOTE: we could return an iterator that delta-encodes these within a doc
addNumericField(field, ords, false);
addNumericField(field, ords, NumberType.ORDINAL);
// write the doc -> ord count as an absolute index to the stream
addOrdIndex(field, docToOrdCount);

lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java

@ -19,18 +19,14 @@ package org.apache.lucene.codecs.lucene54;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.packed.DirectWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
/**
* Lucene 5.4 DocValues format.
@ -51,6 +47,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
* as blocks of bitpacked integers, encoding the deviation from the expected delta.
* <li>Const-compressed: when there is only one possible non-missing value, only the missing
* bitset is encoded.
* <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
* using binary search.
* </ul>
* <p>
* {@link DocValuesType#BINARY BINARY}:
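The binary-search lookup mentioned in the list above can be sketched as follows, under the simplifying assumption that the sorted doc-ID list and the per-document values sit in plain arrays (the shipped code streams them through DirectMonotonicReader instead):

import java.util.Arrays;

final class SparseLookupSketch { // hypothetical, for illustration only
  // docIds: sorted IDs of documents with a value; values[i] belongs to docIds[i].
  static long get(long[] docIds, long[] values, long missingValue, long doc) {
    int idx = Arrays.binarySearch(docIds, doc);
    return idx >= 0 ? values[idx] : missingValue; // absent docs fall back to the default
  }
}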
@ -96,93 +94,6 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
* <li><tt>.dvd</tt>: DocValues data</li>
* <li><tt>.dvm</tt>: DocValues metadata</li>
* </ol>
* <ol>
* <li><a name="dvm"></a>
* <p>The DocValues metadata or .dvm file.</p>
* <p>For DocValues field, this stores metadata, such as the offset into the
* DocValues data (.dvd)</p>
* <p>DocValues metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry</li>
* <li>NumericEntry --&gt; GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
* <li>GCDNumericEntry --&gt; NumericHeader,MinValue,GCD,BitsPerValue</li>
* <li>TableNumericEntry --&gt; NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,BitsPerValue</li>
* <li>DeltaNumericEntry --&gt; NumericHeader,MinValue,BitsPerValue</li>
* <li>MonotonicNumericEntry --&gt; NumericHeader,PackedVersion,BlockSize</li>
* <li>NumericHeader --&gt; FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset</li>
* <li>BinaryEntry --&gt; FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
* <li>FixedBinaryEntry --&gt; BinaryHeader</li>
* <li>VariableBinaryEntry --&gt; BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
* <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
* <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
* <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
* <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
* <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
* <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
* <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
* <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
* <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
* <li>SingleNumericEntry --&gt; SetHeader,NumericEntry</li>
* <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
* <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>TableSize,BitsPerValue,TotalTableLength --&gt; {@link DataOutput#writeVInt vInt}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
* <p>FieldNumber of -1 indicates the end of metadata.</p>
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
* <p>EndOffset is the pointer to the end of the data in the DocValues data (.dvd)</p>
* <p>NumericType indicates how Numeric values will be compressed:
* <ul>
* <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded
* from the minimum value within the block.
* <li>1 --&gt; gcd-compressed. When all integers share a common divisor, only quotients are stored
* using blocks of delta-encoded ints.
* <li>2 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
* a lookup table of unique values is written, followed by the ordinal for each document.
* <li>3 --&gt; monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC.
* <li>4 --&gt; const-compressed. Used when all non-missing values are the same.
* </ul>
* <p>BinaryType indicates how Binary values will be stored:
* <ul>
* <li>0 --&gt; fixed-width. All values have the same length, addressing by multiplication.
* <li>1 --&gt; variable-width. An address for each value is stored.
* <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
* </ul>
* <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
* <ul>
* <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
* offset, and a second one from offset to ord/value.
* <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
* a regular Sorted/Numeric entry.
* <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
* numeric entry that maps each document to an ordinal in this table.
* </ul>
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
* is written for the addresses.
* <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
* If it's -1, then there are no missing values. If it's -2, all values are missing.
* <li><a name="dvd"></a>
* <p>The DocValues data or .dvd file.</p>
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
* <p>DocValues data (.dvd) --&gt; Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup>,Footer</p>
* <ul>
* <li>NumericData --&gt; DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
* <li>BinaryData --&gt; {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
* <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li>
* <li>DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --&gt; {@link DirectWriter PackedInts}</li>
* <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* </ol>
* @lucene.experimental
*/
public final class Lucene54DocValuesFormat extends DocValuesFormat {
@ -207,8 +118,7 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
static final String META_CODEC = "Lucene54DocValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_SORTEDSET_TABLE = 1;
static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
static final int VERSION_CURRENT = VERSION_START;
// indicates docvalues type
static final byte NUMERIC = 0;
@ -242,7 +152,9 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
static final int MONOTONIC_COMPRESSED = 3;
/** Compressed with constant value (uses only missing bitset) */
static final int CONST_COMPRESSED = 4;
/** Compressed with sparse arrays. */
static final int SPARSE_COMPRESSED = 5;
/** Uncompressed binary, written directly (fixed length). */
static final int BINARY_FIXED_UNCOMPRESSED = 0;
/** Uncompressed binary, written directly (variable length). */

lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java

@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesConsumer.NumberType;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues;
@ -314,6 +315,14 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
NumericEntry entry = new NumericEntry();
entry.format = meta.readVInt();
entry.missingOffset = meta.readLong();
if (entry.format == SPARSE_COMPRESSED) {
// sparse bits need a bit more metadata
entry.numDocsWithValue = meta.readVLong();
final int blockShift = meta.readVInt();
entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue + 1, blockShift);
ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
directAddressesMeta.put(info.name, entry.monotonicMeta);
}
entry.offset = meta.readLong();
entry.count = meta.readVLong();
switch(entry.format) {
@ -351,6 +360,30 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
directAddressesMeta.put(info.name, entry.monotonicMeta);
break;
case SPARSE_COMPRESSED:
final byte numberType = meta.readByte();
switch (numberType) {
case 0:
entry.numberType = NumberType.VALUE;
break;
case 1:
entry.numberType = NumberType.ORDINAL;
break;
default:
throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta);
}
// now read the numeric entry for non-missing values
final int fieldNumber = meta.readVInt();
if (fieldNumber != info.number) {
throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta);
}
final int dvFormat = meta.readByte();
if (dvFormat != NUMERIC) {
throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + NUMERIC, meta);
}
entry.nonMissingValues = readNumericEntry(info, meta);
break;
default:
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta);
}
@ -493,11 +526,162 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
}
};
}
case SPARSE_COMPRESSED:
final SparseBits docsWithField = getSparseLiveBits(entry);
final LongValues values = getNumeric(entry.nonMissingValues);
final long missingValue;
switch (entry.numberType) {
case ORDINAL:
missingValue = -1L;
break;
case VALUE:
missingValue = 0L;
break;
default:
throw new AssertionError();
}
return new SparseLongValues(docsWithField, values, missingValue);
default:
throw new AssertionError();
}
}
static class SparseBits implements Bits {
final long maxDoc, docIDsLength, firstDocId;
final LongValues docIds;
long index; // index of docId in docIds
long docId; // doc ID at index
long nextDocId; // doc ID at (index+1)
SparseBits(long maxDoc, long docIDsLength, LongValues docIDs) {
if (docIDsLength > 0 && maxDoc <= docIDs.get(docIDsLength - 1)) {
throw new IllegalArgumentException("maxDoc must be > the last element of docIDs");
}
this.maxDoc = maxDoc;
this.docIDsLength = docIDsLength;
this.docIds = docIDs;
this.firstDocId = docIDsLength == 0 ? maxDoc : docIDs.get(0);
reset();
}
private void reset() {
index = -1;
this.docId = -1;
this.nextDocId = firstDocId;
}
/** Gallop forward and stop as soon as an index is found whose value is greater
* than the given docId. Upon return, {@code index} points to a value that is
* &lt;= {@code docId} while the returned index points to a value that is
* &gt; {@code docId}. These two indices then bound the binary search. */
private long gallop(long docId) {
index++;
this.docId = nextDocId;
long hiIndex = index + 1;
while (true) {
if (hiIndex >= docIDsLength) {
hiIndex = docIDsLength;
nextDocId = maxDoc;
break;
}
final long hiDocId = docIds.get(hiIndex);
if (hiDocId > docId) {
nextDocId = hiDocId;
break;
}
final long delta = hiIndex - index;
index = hiIndex;
this.docId = hiDocId;
hiIndex += delta << 1; // double the step each time
}
return hiIndex;
}
private void binarySearch(long hiIndex, long docId) {
while (index + 1 < hiIndex) {
final long midIndex = (index + hiIndex) >>> 1;
final long midDocId = docIds.get(midIndex);
if (midDocId > docId) {
hiIndex = midIndex;
nextDocId = midDocId;
} else {
index = midIndex;
this.docId = midDocId;
}
}
}
private boolean checkInvariants(long nextIndex, long docId) {
assert this.docId <= docId;
assert this.nextDocId > docId;
assert (index == -1 && this.docId == -1) || this.docId == docIds.get(index);
assert (nextIndex == docIDsLength && nextDocId == maxDoc) || nextDocId == docIds.get(nextIndex);
return true;
}
private void exponentialSearch(long docId) {
// seek forward by doubling the interval on each iteration
final long hiIndex = gallop(docId);
assert checkInvariants(hiIndex, docId);
// now perform the actual binary search
binarySearch(hiIndex, docId);
}
boolean get(final long docId) {
if (docId < this.docId) {
// reading doc IDs backward, go back to the start
reset();
}
if (docId >= nextDocId) {
exponentialSearch(docId);
}
assert checkInvariants(index + 1, docId);
return docId == this.docId;
}
@Override
public boolean get(int index) {
return get((long) index);
}
@Override
public int length() {
return Math.toIntExact(maxDoc);
}
}
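Note that SparseBits is stateful: it caches the window [docId, nextDocId) around the last hit, so forward scans advance cheaply by galloping while a backward read falls into reset() and searches from the start again. Callers therefore benefit from probing doc IDs in increasing order. A small usage sketch, with an anonymous LongValues standing in for the packed doc-ID reader (the same trick testSparseLongValues uses later in this patch):

LongValues ids = new LongValues() {
  private final long[] docIds = {5, 42, 900};
  @Override
  public long get(long index) {
    return docIds[Math.toIntExact(index)];
  }
};
SparseBits bits = new SparseBits(1000, 3, ids); // maxDoc = 1000, three docs have values
bits.get(42);  // true: exponential search forward
bits.get(43);  // false, and cheap: 43 lies inside the cached [42, 900) window
bits.get(5);   // true: backward read triggers reset(), then searches again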
static class SparseLongValues extends LongValues {
final SparseBits docsWithField;
final LongValues values;
final long missingValue;
SparseLongValues(SparseBits docsWithField, LongValues values, long missingValue) {
this.docsWithField = docsWithField;
this.values = values;
this.missingValue = missingValue;
}
@Override
public long get(long docId) {
if (docsWithField.get(docId)) {
return values.get(docsWithField.index);
} else {
return missingValue;
}
}
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry bytes = binaries.get(field.name);
@ -658,7 +842,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
if (ss.format == SORTED_SINGLE_VALUED) {
NumericEntry numericEntry = numerics.get(field.name);
final LongValues values = getNumeric(numericEntry);
final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
final Bits docsWithField;
if (numericEntry.format == SPARSE_COMPRESSED) {
docsWithField = ((SparseLongValues) values).docsWithField;
} else {
docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
}
return DocValues.singleton(values, docsWithField);
} else if (ss.format == SORTED_WITH_ADDRESSES) {
NumericEntry numericEntry = numerics.get(field.name);
@ -898,6 +1087,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
}
}
private SparseBits getSparseLiveBits(NumericEntry entry) throws IOException {
final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset);
final LongValues docIDs = DirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData);
return new SparseBits(maxDoc, entry.numDocsWithValue, docIDs);
}
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
switch(field.getDocValuesType()) {
@ -912,7 +1107,11 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
return getLiveBits(be.missingOffset, maxDoc);
case NUMERIC:
NumericEntry ne = numerics.get(field.name);
return getLiveBits(ne.missingOffset, maxDoc);
if (ne.format == SPARSE_COMPRESSED) {
return getSparseLiveBits(ne);
} else {
return getLiveBits(ne.missingOffset, maxDoc);
}
default:
throw new AssertionError();
}
@ -950,6 +1149,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
long minValue;
long gcd;
long table[];
/** for sparse compression */
long numDocsWithValue;
NumericEntry nonMissingValues;
NumberType numberType;
}
/** metadata entry for a binary docvalues field */

lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java

@ -18,32 +18,52 @@ package org.apache.lucene.codecs.lucene54;
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseBits;
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseLongValues;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.TestUtil;
/**
@ -115,7 +135,141 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
}
}
@Slow
public void testSparseDocValuesVsStoredFields() throws Exception {
int numIterations = atLeast(2);
for (int i = 0; i < numIterations; i++) {
doTestSparseDocValuesVsStoredFields();
}
}
private void doTestSparseDocValuesVsStoredFields() throws Exception {
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
for (int i = 0; i < values.length; ++i) {
values[i] = random().nextLong();
}
Directory dir = newFSDirectory(createTempDir());
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMergeScheduler(new SerialMergeScheduler());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
// sparse compression is only enabled if at most 1% of docs have a value
final int avgGap = 100;
final int numDocs = atLeast(100);
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
writer.addDocument(new Document());
}
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
// single-valued
long docValue = values[random().nextInt(values.length)];
doc.add(new NumericDocValuesField("numeric", docValue));
doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("value", docValue));
// multi-valued
final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
for (int j = 0; j < numValues; ++j) {
docValue = values[random().nextInt(values.length)];
doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("values", docValue));
}
writer.addDocument(doc);
// add a gap
for (int j = random().nextInt(avgGap * 2); j >= 0; --j) {
writer.addDocument(new Document());
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
final IndexReader indexReader = writer.getReader();
writer.close();
for (LeafReaderContext context : indexReader.leaves()) {
final LeafReader reader = context.reader();
final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
final Bits numericBits = DocValues.getDocsWithField(reader, "numeric");
final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
final Bits sortedBits = DocValues.getDocsWithField(reader, "sorted");
final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
final Bits binaryBits = DocValues.getDocsWithField(reader, "binary");
final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
final Bits sortedNumericBits = DocValues.getDocsWithField(reader, "sorted_numeric");
final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
final Bits sortedSetBits = DocValues.getDocsWithField(reader, "sorted_set");
for (int i = 0; i < reader.maxDoc(); ++i) {
final StoredDocument doc = reader.document(i);
final StorableField valueField = doc.getField("value");
final Long value = valueField == null ? null : valueField.numericValue().longValue();
if (value == null) {
assertEquals(0, numeric.get(i));
assertEquals(-1, sorted.getOrd(i));
assertEquals(new BytesRef(), binary.get(i));
assertFalse(numericBits.get(i));
assertFalse(sortedBits.get(i));
assertFalse(binaryBits.get(i));
} else {
assertEquals(value.longValue(), numeric.get(i));
assertTrue(sorted.getOrd(i) >= 0);
assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.getOrd(i)));
assertEquals(new BytesRef(Long.toString(value)), binary.get(i));
assertTrue(numericBits.get(i));
assertTrue(sortedBits.get(i));
assertTrue(binaryBits.get(i));
}
final StorableField[] valuesFields = doc.getFields("values");
final Set<Long> valueSet = new HashSet<>();
for (StorableField sf : valuesFields) {
valueSet.add(sf.numericValue().longValue());
}
sortedNumeric.setDocument(i);
assertEquals(valuesFields.length, sortedNumeric.count());
for (int j = 0; j < sortedNumeric.count(); ++j) {
assertTrue(valueSet.contains(sortedNumeric.valueAt(j)));
}
sortedSet.setDocument(i);
int sortedSetCount = 0;
while (true) {
long ord = sortedSet.nextOrd();
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
break;
}
assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
sortedSetCount++;
}
assertEquals(valueSet.size(), sortedSetCount);
assertEquals(!valueSet.isEmpty(), sortedNumericBits.get(i));
assertEquals(!valueSet.isEmpty(), sortedSetBits.get(i));
}
}
indexReader.close();
dir.close();
}
// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
@ -278,4 +432,74 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
}
}
}
public void testSparseLongValues() {
final int iters = atLeast(5);
for (int iter = 0; iter < iters; ++iter) {
final int numDocs = TestUtil.nextInt(random(), 0, 100);
final long[] docIds = new long[numDocs];
final long[] values = new long[numDocs];
final long maxDoc;
if (numDocs == 0) {
maxDoc = 1 + random().nextInt(10);
} else {
docIds[0] = random().nextInt(10);
for (int i = 1; i < docIds.length; ++i) {
docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
}
maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
}
for (int i = 0; i < values.length; ++i) {
values[i] = random().nextLong();
}
final long missingValue = random().nextLong();
final LongValues docIdsValues = new LongValues() {
@Override
public long get(long index) {
return docIds[Math.toIntExact(index)];
}
};
final LongValues valuesValues = new LongValues() {
@Override
public long get(long index) {
return values[Math.toIntExact(index)];
}
};
final SparseBits liveBits = new SparseBits(maxDoc, numDocs, docIdsValues);
// random-access
for (int i = 0; i < 2000; ++i) {
final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
final boolean exists = liveBits.get(Math.toIntExact(docId));
assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
}
// sequential access
for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
final boolean exists = liveBits.get(Math.toIntExact(docId));
assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
}
final SparseLongValues sparseValues = new SparseLongValues(liveBits, valuesValues, missingValue);
// random-access
for (int i = 0; i < 2000; ++i) {
final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
final int idx = Arrays.binarySearch(docIds, docId);
final long value = sparseValues.get(docId);
if (idx >= 0) {
assertEquals(values[idx], value);
} else {
assertEquals(missingValue, value);
}
}
// sequential access
for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
final int idx = Arrays.binarySearch(docIds, docId);
final long value = sparseValues.get(docId);
if (idx >= 0) {
assertEquals(values[idx], value);
} else {
assertEquals(missingValue, value);
}
}
}
}
}