diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5f69675962e..24809ce35a3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -186,6 +186,9 @@ Optimizations * LUCENE-6885: StandardDirectoryReader (initialCapacity) tweaks (Christine Poerschke) +* LUCENE-6863: Optimized storage requirements of doc values fields when less + than 1% of documents have a value. (Adrien Grand) + Bug Fixes * LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java index e3957004e98..f53d0377e22 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.stream.StreamSupport; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; @@ -54,6 +55,13 @@ import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*; /** writer for {@link Lucene54DocValuesFormat} */ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable { + enum NumberType { + /** Dense ordinals */ + ORDINAL, + /** Random long values */ + VALUE; + } + IndexOutput data, meta; final int maxDoc; @@ -78,10 +86,10 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close @Override public void addNumericField(FieldInfo field, Iterable values) throws IOException { - addNumericField(field, values, true); + addNumericField(field, values, NumberType.VALUE); } - void addNumericField(FieldInfo field, Iterable values, boolean optimizeStorage) throws IOException { + void addNumericField(FieldInfo field, Iterable values, NumberType numberType) throws IOException 
{ long count = 0; long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; @@ -90,7 +98,8 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close long zeroCount = 0; // TODO: more efficient? HashSet uniqueValues = null; - if (optimizeStorage) { + long missingOrdCount = 0; + if (numberType == NumberType.VALUE) { uniqueValues = new HashSet<>(); for (Number nv : values) { @@ -133,6 +142,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close } else { for (Number nv : values) { long v = nv.longValue(); + if (v == -1L) { + missingOrdCount++; + } minValue = Math.min(minValue, v); maxValue = Math.max(maxValue, v); ++count; @@ -145,6 +157,18 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close ? Integer.MAX_VALUE : DirectWriter.bitsRequired(uniqueValues.size() - 1); + final boolean sparse; // 1% of docs or less have a value + switch (numberType) { + case VALUE: + sparse = (double) missingCount / count >= 0.99; + break; + case ORDINAL: + sparse = (double) missingOrdCount / count >= 0.99; + break; + default: + throw new AssertionError(); + } + final int format; if (uniqueValues != null && count <= Integer.MAX_VALUE @@ -152,6 +176,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close || (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) { // either one unique value C or two unique values: "missing" and C format = CONST_COMPRESSED; + } else if (sparse && count >= 1024) { + // require at least 1024 docs to avoid flipping back and forth when doing NRT search + format = SPARSE_COMPRESSED; } else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) { format = TABLE_COMPRESSED; } else if (gcd != 0 && gcd != 1) { @@ -164,7 +191,22 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close meta.writeVInt(field.number); meta.writeByte(Lucene54DocValuesFormat.NUMERIC); meta.writeVInt(format); - 
if (missingCount == 0) { + if (format == SPARSE_COMPRESSED) { + meta.writeLong(data.getFilePointer()); + final long numDocsWithValue; + switch (numberType) { + case VALUE: + numDocsWithValue = count - missingCount; + break; + case ORDINAL: + numDocsWithValue = count - missingOrdCount; + break; + default: + throw new AssertionError(); + } + final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue); + assert maxDoc == count; + } else if (missingCount == 0) { meta.writeLong(ALL_LIVE); } else if (missingCount == count) { meta.writeLong(ALL_MISSING); @@ -220,6 +262,39 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close } ordsWriter.finish(); break; + case SPARSE_COMPRESSED: + final Iterable filteredMissingValues; + switch (numberType) { + case VALUE: + meta.writeByte((byte) 0); + filteredMissingValues = new Iterable() { + @Override + public Iterator iterator() { + return StreamSupport + .stream(values.spliterator(), false) + .filter(value -> value != null) + .iterator(); + } + }; + break; + case ORDINAL: + meta.writeByte((byte) 1); + filteredMissingValues = new Iterable() { + @Override + public Iterator iterator() { + return StreamSupport + .stream(values.spliterator(), false) + .filter(value -> value.longValue() != -1L) + .iterator(); + } + }; + break; + default: + throw new AssertionError(); + } + // Write non-missing values as a numeric field + addNumericField(field, filteredMissingValues, numberType); + break; default: throw new AssertionError(); } @@ -247,6 +322,34 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close } } + long writeSparseMissingBitset(Iterable values, NumberType numberType, long numDocsWithValue) throws IOException { + meta.writeVLong(numDocsWithValue); + + // Write doc IDs that have a value + meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, 
DIRECT_MONOTONIC_BLOCK_SHIFT); + long docID = 0; + for (Number nv : values) { + switch (numberType) { + case VALUE: + if (nv != null) { + docIdsWriter.add(docID); + } + break; + case ORDINAL: + if (nv.longValue() != -1L) { + docIdsWriter.add(docID); + } + break; + default: + throw new AssertionError(); + } + docID++; + } + docIdsWriter.finish(); + return docID; + } + @Override public void addBinaryField(FieldInfo field, Iterable values) throws IOException { // write the byte[] data @@ -458,7 +561,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close meta.writeVInt(field.number); meta.writeByte(Lucene54DocValuesFormat.SORTED); addTermsDict(field, values); - addNumericField(field, docToOrd, false); + addNumericField(field, docToOrd, NumberType.ORDINAL); } @Override @@ -478,11 +581,11 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close writeDictionary(uniqueValueSets); // write the doc -> set_id as a numeric field - addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false); + addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL); } else { meta.writeVInt(SORTED_WITH_ADDRESSES); // write the stream of values as a numeric field - addNumericField(field, values, true); + addNumericField(field, values, NumberType.VALUE); // write the doc -> ord count as a absolute index to the stream addOrdIndex(field, docToValueCount); } @@ -510,7 +613,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close addTermsDict(field, values); // write the doc -> set_id as a numeric field - addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false); + addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL); } else { meta.writeVInt(SORTED_WITH_ADDRESSES); @@ -519,7 +622,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close // write the stream of ords 
as a numeric field // NOTE: we could return an iterator that delta-encodes these within a doc - addNumericField(field, ords, false); + addNumericField(field, ords, NumberType.ORDINAL); // write the doc -> ord count as a absolute index to the stream addOrdIndex(field, docToOrdCount); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java index d3b071e50f8..c6e55cd2de4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java @@ -19,18 +19,14 @@ package org.apache.lucene.codecs.lucene54; import java.io.IOException; -import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.SmallFloat; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.packed.DirectWriter; -import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; /** * Lucene 5.4 DocValues format. @@ -51,6 +47,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; * as blocks of bitpacked integers, encoding the deviation from the expected delta. *
  • Const-compressed: when there is only one possible non-missing value, only the missing * bitset is encoded. + *
  • Sparse-compressed: when at most 1% of documents have a value, only documents with a value + * are stored, and lookups are performed using binary search. * 

    * {@link DocValuesType#BINARY BINARY}: @@ -96,93 +94,6 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; *

  • .dvd: DocValues data
  • *
  • .dvm: DocValues metadata
  • * - *
      - *
    1. - *

      The DocValues metadata or .dvm file.

      - *

      For DocValues field, this stores metadata, such as the offset into the - * DocValues data (.dvd)

      - *

      DocValues metadata (.dvm) --> Header,<Entry>NumFields,Footer

      - *
        - *
      • Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry
      • - *
      • NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry
      • - *
      • GCDNumericEntry --> NumericHeader,MinValue,GCD,BitsPerValue
      • - *
      • TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}TableSize,BitsPerValue
      • - *
      • DeltaNumericEntry --> NumericHeader,MinValue,BitsPerValue
      • - *
      • MonotonicNumericEntry --> NumericHeader,PackedVersion,BlockSize
      • - *
      • NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset
      • - *
      • BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry
      • - *
      • FixedBinaryEntry --> BinaryHeader
      • - *
      • VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize
      • - *
      • PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize
      • - *
      • BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset
      • - *
      • SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry
      • - *
      • SortedSetEntry --> SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry
      • - *
      • SingleSortedSetEntry --> SetHeader,SortedEntry
      • - *
      • AddressesSortedSetEntry --> SetHeader,BinaryEntry,NumericEntry,NumericEntry
      • - *
      • TableSortedSetEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}TotalTableLength,TableSize,{@link DataOutput#writeInt Int32}TableSize,BinaryEntry,NumericEntry
      • - *
      • SetHeader --> FieldNumber,EntryType,SetType
      • - *
      • SortedNumericEntry --> SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry
      • - *
      • SingleNumericEntry --> SetHeader,NumericEntry
      • - *
      • AddressesSortedNumericEntry --> SetHeader,NumericEntry,NumericEntry
      • - *
      • TableSortedNumericEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}TotalTableLength,TableSize,{@link DataOutput#writeInt Int32}TableSize,NumericEntry
      • - *
      • FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
      • - *
      • EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
      • - *
      • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
      • - *
      • MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}
      • - *
      • TableSize,BitsPerValue,TotalTableLength --> {@link DataOutput#writeVInt vInt}
      • - *
      • Footer --> {@link CodecUtil#writeFooter CodecFooter}
      • - *
      - *

      Sorted fields have two entries: a BinaryEntry with the value metadata, - * and an ordinary NumericEntry for the document-to-ord metadata.

      - *

      FieldNumber of -1 indicates the end of metadata.

      - *

      EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)

      - *

      DataOffset is the pointer to the start of the data in the DocValues data (.dvd)

      - *

      EndOffset is the pointer to the end of the data in the DocValues data (.dvd)

      - *

      NumericType indicates how Numeric values will be compressed: - *

        - *
      • 0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded - * from the minimum value within the block. - *
      • 1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored - * using blocks of delta-encoded ints. - *
      • 2 --> table-compressed. When the number of unique numeric values is small and it would save space, - * a lookup table of unique values is written, followed by the ordinal for each document. - *
      • 3 --> monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC. - *
      • 4 --> const-compressed. Used when all non-missing values are the same. - *
      - *

      BinaryType indicates how Binary values will be stored: - *

        - *
      • 0 --> fixed-width. All values have the same length, addressing by multiplication. - *
      • 1 --> variable-width. An address for each value is stored. - *
      • 2 --> prefix-compressed. An address to the start of every interval'th value is stored. - *
      - *

      SetType indicates how SortedSet and SortedNumeric values will be stored: - *

        - *
      • 0 --> with addresses. There are two numeric entries: a first one from document to start - * offset, and a second one from offset to ord/value. - *
      • 1 --> single-valued. Used when all documents have at most one value and is encoded like - * a regular Sorted/Numeric entry. - *
      • 2 --> table-encoded. A lookup table of unique sets of values is written, followed by a - * numeric entry that maps each document to an ordinal in this table. - *
      - *

      MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. - * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). - * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) - * is written for the addresses. - *

      MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field. - * If it's -1, then there are no missing values. If it's -2, all values are missing. - *

    2. - *

      The DocValues data or .dvd file.

      - *

      For DocValues field, this stores the actual per-document data (the heavy-lifting)

      - *

      DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData>NumFields,Footer

      - *
        - *
      • NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics
      • - *
      • BinaryData --> {@link DataOutput#writeByte Byte}DataLength,Addresses
      • - *
      • SortedData --> {@link FST FST<Int64>}
      • - *
      • DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --> {@link DirectWriter PackedInts}
      • - *
      • Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}
      • - *
      • Footer --> {@link CodecUtil#writeFooter CodecFooter}
      • - *
      - *
    * @lucene.experimental */ public final class Lucene54DocValuesFormat extends DocValuesFormat { @@ -207,8 +118,7 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat { static final String META_CODEC = "Lucene54DocValuesMetadata"; static final String META_EXTENSION = "dvm"; static final int VERSION_START = 0; - static final int VERSION_SORTEDSET_TABLE = 1; - static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE; + static final int VERSION_CURRENT = VERSION_START; // indicates docvalues type static final byte NUMERIC = 0; @@ -242,7 +152,9 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat { static final int MONOTONIC_COMPRESSED = 3; /** Compressed with constant value (uses only missing bitset) */ static final int CONST_COMPRESSED = 4; - + /** Compressed with sparse arrays. */ + static final int SPARSE_COMPRESSED = 5; + /** Uncompressed binary, written directly (fixed length). */ static final int BINARY_FIXED_UNCOMPRESSED = 0; /** Uncompressed binary, written directly (variable length). 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java index 9d46c6d0e26..5fac6ed3d24 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java @@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.lucene54.Lucene54DocValuesConsumer.NumberType; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocValues; @@ -314,6 +315,14 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close NumericEntry entry = new NumericEntry(); entry.format = meta.readVInt(); entry.missingOffset = meta.readLong(); + if (entry.format == SPARSE_COMPRESSED) { + // sparse bits need a bit more metadata + entry.numDocsWithValue = meta.readVLong(); + final int blockShift = meta.readVInt(); + entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue + 1, blockShift); + ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); + directAddressesMeta.put(info.name, entry.monotonicMeta); + } entry.offset = meta.readLong(); entry.count = meta.readVLong(); switch(entry.format) { @@ -351,6 +360,30 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); directAddressesMeta.put(info.name, entry.monotonicMeta); break; + case SPARSE_COMPRESSED: + final byte numberType = meta.readByte(); + switch (numberType) { + case 0: + entry.numberType = NumberType.VALUE; + break; + case 1: + entry.numberType = NumberType.ORDINAL; + break; + default: + throw new CorruptIndexException("Number type can only 
be 0 or 1, got=" + numberType, meta); + } + + // now read the numeric entry for non-missing values + final int fieldNumber = meta.readVInt(); + if (fieldNumber != info.number) { + throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta); + } + final int dvFormat = meta.readByte(); + if (dvFormat != NUMERIC) { + throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + NUMERIC, meta); + } + entry.nonMissingValues = readNumericEntry(info, meta); + break; default: throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta); } @@ -493,11 +526,162 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } }; } + case SPARSE_COMPRESSED: + final SparseBits docsWithField = getSparseLiveBits(entry); + final LongValues values = getNumeric(entry.nonMissingValues); + final long missingValue; + switch (entry.numberType) { + case ORDINAL: + missingValue = -1L; + break; + case VALUE: + missingValue = 0L; + break; + default: + throw new AssertionError(); + } + return new SparseLongValues(docsWithField, values, missingValue); default: throw new AssertionError(); } } + static class SparseBits implements Bits { + + final long maxDoc, docIDsLength, firstDocId; + final LongValues docIds; + + long index; // index of docId in docIds + long docId; // doc ID at index + long nextDocId; // doc ID at (index+1) + + SparseBits(long maxDoc, long docIDsLength, LongValues docIDs) { + if (docIDsLength > 0 && maxDoc <= docIDs.get(docIDsLength - 1)) { + throw new IllegalArgumentException("maxDoc must be > the last element of docIDs"); + } + this.maxDoc = maxDoc; + this.docIDsLength = docIDsLength; + this.docIds = docIDs; + this.firstDocId = docIDsLength == 0 ? 
maxDoc : docIDs.get(0); + reset(); + } + + private void reset() { + index = -1; + this.docId = -1; + this.nextDocId = firstDocId; + } + + /** Gallop forward and stop as soon as an index is found that is greater than + * the given docId. {@code index} will store an index that stores a value + * that is <= {@code docId} while the return value will give an index + * that stores a value that is > {@code docId}. These indices can then be + * used to binary search. */ + private long gallop(long docId) { + index++; + this.docId = nextDocId; + long hiIndex = index + 1; + + while (true) { + if (hiIndex >= docIDsLength) { + hiIndex = docIDsLength; + nextDocId = maxDoc; + break; + } + + final long hiDocId = docIds.get(hiIndex); + if (hiDocId > docId) { + nextDocId = hiDocId; + break; + } + + final long delta = hiIndex - index; + index = hiIndex; + this.docId = hiDocId; + hiIndex += delta << 1; // double the step each time + } + return hiIndex; + } + + private void binarySearch(long hiIndex, long docId) { + while (index + 1 < hiIndex) { + final long midIndex = (index + hiIndex) >>> 1; + final long midDocId = docIds.get(midIndex); + if (midDocId > docId) { + hiIndex = midIndex; + nextDocId = midDocId; + } else { + index = midIndex; + this.docId = midDocId; + } + } + } + + private boolean checkInvariants(long nextIndex, long docId) { + assert this.docId <= docId; + assert this.nextDocId > docId; + assert (index == -1 && this.docId == -1) || this.docId == docIds.get(index); + assert (nextIndex == docIDsLength && nextDocId == maxDoc) || nextDocId == docIds.get(nextIndex); + return true; + } + + private void exponentialSearch(long docId) { + // seek forward by doubling the interval on each iteration + final long hiIndex = gallop(docId); + assert checkInvariants(hiIndex, docId); + + // now perform the actual binary search + binarySearch(hiIndex, docId); + } + + boolean get(final long docId) { + if (docId < this.docId) { + // reading doc IDs backward, go back to the start + reset(); 
+ } + + if (docId >= nextDocId) { + exponentialSearch(docId); + } + + assert checkInvariants(index + 1, docId); + return docId == this.docId; + } + + @Override + public boolean get(int index) { + return get((long) index); + } + + @Override + public int length() { + return Math.toIntExact(maxDoc); + } + } + + static class SparseLongValues extends LongValues { + + final SparseBits docsWithField; + final LongValues values; + final long missingValue; + + SparseLongValues(SparseBits docsWithField, LongValues values, long missingValue) { + this.docsWithField = docsWithField; + this.values = values; + this.missingValue = missingValue; + } + + @Override + public long get(long docId) { + if (docsWithField.get(docId)) { + return values.get(docsWithField.index); + } else { + return missingValue; + } + } + + } + @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { BinaryEntry bytes = binaries.get(field.name); @@ -658,7 +842,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close if (ss.format == SORTED_SINGLE_VALUED) { NumericEntry numericEntry = numerics.get(field.name); final LongValues values = getNumeric(numericEntry); - final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); + final Bits docsWithField; + if (numericEntry.format == SPARSE_COMPRESSED) { + docsWithField = ((SparseLongValues) values).docsWithField; + } else { + docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); + } return DocValues.singleton(values, docsWithField); } else if (ss.format == SORTED_WITH_ADDRESSES) { NumericEntry numericEntry = numerics.get(field.name); @@ -898,6 +1087,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } } + private SparseBits getSparseLiveBits(NumericEntry entry) throws IOException { + final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset); + final LongValues docIDs = 
DirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData); + return new SparseBits(maxDoc, entry.numDocsWithValue, docIDs); + } + @Override public Bits getDocsWithField(FieldInfo field) throws IOException { switch(field.getDocValuesType()) { @@ -912,7 +1107,11 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close return getLiveBits(be.missingOffset, maxDoc); case NUMERIC: NumericEntry ne = numerics.get(field.name); - return getLiveBits(ne.missingOffset, maxDoc); + if (ne.format == SPARSE_COMPRESSED) { + return getSparseLiveBits(ne); + } else { + return getLiveBits(ne.missingOffset, maxDoc); + } default: throw new AssertionError(); } @@ -950,6 +1149,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close long minValue; long gcd; long table[]; + + /** for sparse compression */ + long numDocsWithValue; + NumericEntry nonMissingValues; + NumberType numberType; + } /** metadata entry for a binary docvalues field */ diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java index fc847dd5f68..2027ee8eb20 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java @@ -18,32 +18,52 @@ package org.apache.lucene.codecs.lucene54; */ import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.asserting.AssertingCodec; +import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseBits; +import 
org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseLongValues; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.StorableField; +import org.apache.lucene.index.StoredDocument; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; import org.apache.lucene.util.TestUtil; /** @@ -115,7 +135,141 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500); } } - + + @Slow + public void testSparseDocValuesVsStoredFields() throws Exception { + int 
numIterations = atLeast(2); + for (int i = 0; i < numIterations; i++) { + doTestSparseDocValuesVsStoredFields(); + } + } + + private void doTestSparseDocValuesVsStoredFields() throws Exception { + final long[] values = new long[TestUtil.nextInt(random(), 1, 500)]; + for (int i = 0; i < values.length; ++i) { + values[i] = random().nextLong(); + } + + Directory dir = newFSDirectory(createTempDir()); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + conf.setMergeScheduler(new SerialMergeScheduler()); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + + // sparse compression is only enabled if less than 1% of docs have a value + final int avgGap = 100; + + final int numDocs = atLeast(100); + for (int i = random().nextInt(avgGap * 2); i >= 0; --i) { + writer.addDocument(new Document()); + } + final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + + // single-valued + long docValue = values[random().nextInt(values.length)]; + doc.add(new NumericDocValuesField("numeric", docValue)); + doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue)))); + doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue)))); + doc.add(new StoredField("value", docValue)); + + // multi-valued + final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc); + for (int j = 0; j < numValues; ++j) { + docValue = values[random().nextInt(values.length)]; + doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue)); + doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue)))); + doc.add(new StoredField("values", docValue)); + } + + writer.addDocument(doc); + + // add a gap + for (int j = random().nextInt(avgGap * 2); j >= 0; --j) { + writer.addDocument(new Document()); + } + } + + if (random().nextBoolean()) { + writer.forceMerge(1); + } + + 
final IndexReader indexReader = writer.getReader(); + writer.close(); + + for (LeafReaderContext context : indexReader.leaves()) { + final LeafReader reader = context.reader(); + final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric"); + final Bits numericBits = DocValues.getDocsWithField(reader, "numeric"); + + final SortedDocValues sorted = DocValues.getSorted(reader, "sorted"); + final Bits sortedBits = DocValues.getDocsWithField(reader, "sorted"); + + final BinaryDocValues binary = DocValues.getBinary(reader, "binary"); + final Bits binaryBits = DocValues.getDocsWithField(reader, "binary"); + + final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric"); + final Bits sortedNumericBits = DocValues.getDocsWithField(reader, "sorted_numeric"); + + final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set"); + final Bits sortedSetBits = DocValues.getDocsWithField(reader, "sorted_set"); + + for (int i = 0; i < reader.maxDoc(); ++i) { + final StoredDocument doc = reader.document(i); + final StorableField valueField = doc.getField("value"); + final Long value = valueField == null ? 
null : valueField.numericValue().longValue(); + + if (value == null) { + assertEquals(0, numeric.get(i)); + assertEquals(-1, sorted.getOrd(i)); + assertEquals(new BytesRef(), binary.get(i)); + + assertFalse(numericBits.get(i)); + assertFalse(sortedBits.get(i)); + assertFalse(binaryBits.get(i)); + } else { + assertEquals(value.longValue(), numeric.get(i)); + assertTrue(sorted.getOrd(i) >= 0); + assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.getOrd(i))); + assertEquals(new BytesRef(Long.toString(value)), binary.get(i)); + + assertTrue(numericBits.get(i)); + assertTrue(sortedBits.get(i)); + assertTrue(binaryBits.get(i)); + } + + final StorableField[] valuesFields = doc.getFields("values"); + final Set valueSet = new HashSet<>(); + for (StorableField sf : valuesFields) { + valueSet.add(sf.numericValue().longValue()); + } + + sortedNumeric.setDocument(i); + assertEquals(valuesFields.length, sortedNumeric.count()); + for (int j = 0; j < sortedNumeric.count(); ++j) { + assertTrue(valueSet.contains(sortedNumeric.valueAt(j))); + } + sortedSet.setDocument(i); + int sortedSetCount = 0; + while (true) { + long ord = sortedSet.nextOrd(); + if (ord == SortedSetDocValues.NO_MORE_ORDS) { + break; + } + assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString()))); + sortedSetCount++; + } + assertEquals(valueSet.size(), sortedSetCount); + + assertEquals(!valueSet.isEmpty(), sortedNumericBits.get(i)); + assertEquals(!valueSet.isEmpty(), sortedSetBits.get(i)); + } + } + + indexReader.close(); + dir.close(); + } + // TODO: try to refactor this and some termsenum tests into the base class. // to do this we need to fix the test class to get a DVF not a Codec so we can setup // the postings format correctly. 
@@ -278,4 +432,74 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
       }
     }
   }
+  /** Verifies SparseBits and SparseLongValues over random, strictly increasing doc ids: every lookup must agree with a binary search of the source arrays. */
+  public void testSparseLongValues() {
+    final int iters = atLeast(5);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int numDocs = TestUtil.nextInt(random(), 0, 100); // number of docs that carry a value; the zero case is handled below
+      final long[] docIds = new long[numDocs]; // doc ids that have a value
+      final long[] values = new long[numDocs]; // values[i] is the value expected for docIds[i]
+      final long maxDoc;
+      if (numDocs == 0) {
+        maxDoc = 1 + random().nextInt(10); // nothing has a value, so maxDoc is picked independently of docIds
+      } else {
+        docIds[0] = random().nextInt(10);
+        for (int i = 1; i < docIds.length; ++i) {
+          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100); // each id exceeds the previous one, so docIds stays sorted for binarySearch
+        }
+        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10); // maxDoc lies past the last doc id that has a value
+      }
+      for (int i = 0; i < values.length; ++i) {
+        values[i] = random().nextLong();
+      }
+      final long missingValue = random().nextLong(); // value expected for docs that are not in docIds
+      final LongValues docIdsValues = new LongValues() { // LongValues view over the docIds array
+        @Override
+        public long get(long index) {
+          return docIds[Math.toIntExact(index)];
+        }
+      };
+      final LongValues valuesValues = new LongValues() { // LongValues view over the values array
+        @Override
+        public long get(long index) {
+          return values[Math.toIntExact(index)];
+        }
+      };
+      final SparseBits liveBits = new SparseBits(maxDoc, numDocs, docIdsValues);
+      // random-access: a bit must be set exactly when the doc id is found in docIds
+      for (int i = 0; i < 2000; ++i) {
+        final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
+        final boolean exists = liveBits.get(Math.toIntExact(docId));
+        assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
+      }
+      // sequential access: doc ids never go backwards; a zero step re-checks the same doc
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final boolean exists = liveBits.get(Math.toIntExact(docId));
+        assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
+      }
+
+      final SparseLongValues sparseValues = new SparseLongValues(liveBits, valuesValues, missingValue);
+      // random-access: docs found in docIds must return their value, all others missingValue
+      for (int i = 0; i < 2000; ++i) {
+        final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
+        final int idx = Arrays.binarySearch(docIds, docId); // idx >= 0 only when docId has a value
+        final long value = sparseValues.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+      // sequential access: same expectations, with doc ids that never go backwards
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final int idx = Arrays.binarySearch(docIds, docId); // idx >= 0 only when docId has a value
+        final long value = sparseValues.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+    }
+  }
 }