LUCENE-6863: Optimized storage requirements of doc values fields when less than 1% of documents have a value.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712957 13f79535-47bb-0310-9956-ffa450edef68
2015-11-06 13:04:36 +00:00 · 2015-11-06 13:04:36 +00:00 · 7c917a5ed8
parent 4ef2d43d58
commit 7c917a5ed8
5 changed files with 553 additions and 106 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -186,6 +186,9 @@ Optimizations
 * LUCENE-6885: StandardDirectoryReader (initialCapacity) tweaks
  (Christine Poerschke)

+* LUCENE-6863: Optimized storage requirements of doc values fields when less
+  than 1% of documents have a value. (Adrien Grand)
+
 Bug Fixes

 * LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display 
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
@ -28,6 +28,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.stream.StreamSupport;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
@ -54,6 +55,13 @@ import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*;
 /** writer for {@link Lucene54DocValuesFormat} */
 final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable {

+  enum NumberType {
+    /** Dense ordinals; an ordinal of -1 marks a document without a value */
+    ORDINAL,
+    /** Random long values; a null entry marks a document without a value */
+    VALUE;
+  }
+
  IndexOutput data, meta;
  final int maxDoc;
  
@ -78,10 +86,10 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
  
  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
-    addNumericField(field, values, true);
+    addNumericField(field, values, NumberType.VALUE);
  }

-  void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
+  void addNumericField(FieldInfo field, Iterable<Number> values, NumberType numberType) throws IOException {
    long count = 0;
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
@ -90,7 +98,8 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
    long zeroCount = 0;
    // TODO: more efficient?
    HashSet<Long> uniqueValues = null;
-    if (optimizeStorage) {
+    long missingOrdCount = 0;
+    if (numberType == NumberType.VALUE) {
      uniqueValues = new HashSet<>();

      for (Number nv : values) {
@ -133,6 +142,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
    } else {
      for (Number nv : values) {
        long v = nv.longValue();
+        if (v == -1L) {
+          missingOrdCount++;
+        }
        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);
        ++count;
@ -145,6 +157,18 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
        ? Integer.MAX_VALUE
        : DirectWriter.bitsRequired(uniqueValues.size() - 1);

+    final boolean sparse; // 1% of docs or less have a value
+    switch (numberType) {
+      case VALUE:
+        sparse = (double) missingCount / count >= 0.99;
+        break;
+      case ORDINAL:
+        sparse = (double) missingOrdCount / count >= 0.99;
+        break;
+      default:
+        throw new AssertionError();
+    }
+
    final int format;
    if (uniqueValues != null 
        && count <= Integer.MAX_VALUE
@ -152,6 +176,9 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
           || (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) {
      // either one unique value C or two unique values: "missing" and C
      format = CONST_COMPRESSED;
+    } else if (sparse && count >= 1024) {
+      // require at least 1024 docs to avoid flipping back and forth when doing NRT search
+      format = SPARSE_COMPRESSED;
    } else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) {
      format = TABLE_COMPRESSED;
    } else if (gcd != 0 && gcd != 1) {
@ -164,7 +191,22 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
    meta.writeVInt(field.number);
    meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
    meta.writeVInt(format);
-    if (missingCount == 0) {
+    if (format == SPARSE_COMPRESSED) {
+      meta.writeLong(data.getFilePointer());
+      final long numDocsWithValue;
+      switch (numberType) {
+        case VALUE:
+          numDocsWithValue = count - missingCount;
+          break;
+        case ORDINAL:
+          numDocsWithValue = count - missingOrdCount;
+          break;
+        default:
+          throw new AssertionError();
+      }
+      final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue);
+      assert maxDoc == count;
+    } else if (missingCount == 0) {
      meta.writeLong(ALL_LIVE);
    } else if (missingCount == count) {
      meta.writeLong(ALL_MISSING);
@ -220,6 +262,39 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
        }
        ordsWriter.finish();
        break;
+      case SPARSE_COMPRESSED:
+        final Iterable<Number> filteredMissingValues;
+        switch (numberType) {
+          case VALUE:
+            meta.writeByte((byte) 0);
+            filteredMissingValues = new Iterable<Number>() {
+              @Override
+              public Iterator<Number> iterator() {
+                return StreamSupport
+                    .stream(values.spliterator(), false)
+                    .filter(value -> value != null)
+                    .iterator();
+              }
+            };
+            break;
+          case ORDINAL:
+            meta.writeByte((byte) 1);
+            filteredMissingValues = new Iterable<Number>() {
+              @Override
+              public Iterator<Number> iterator() {
+                return StreamSupport
+                    .stream(values.spliterator(), false)
+                    .filter(value -> value.longValue() != -1L)
+                    .iterator();
+              }
+            };
+            break;
+          default:
+            throw new AssertionError();
+        }
+        // Write non-missing values as a numeric field
+        addNumericField(field, filteredMissingValues, numberType);
+        break;
      default:
        throw new AssertionError();
    }
@ -247,6 +322,34 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
    }
  }

+  long writeSparseMissingBitset(Iterable<Number> values, NumberType numberType, long numDocsWithValue) throws IOException { // writes the IDs of docs that have a value; returns how many entries of values were iterated
+    meta.writeVLong(numDocsWithValue);
+
+    // Write doc IDs that have a value
+    meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
+    final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, DIRECT_MONOTONIC_BLOCK_SHIFT); // sized for exactly numDocsWithValue doc IDs
+    long docID = 0; // position of the current entry within values
+    for (Number nv : values) {
+      switch (numberType) {
+        case VALUE:
+          if (nv != null) { // null means the doc has no value
+            docIdsWriter.add(docID);
+          }
+          break;
+        case ORDINAL:
+          if (nv.longValue() != -1L) { // -1 means the doc has no ordinal
+            docIdsWriter.add(docID);
+          }
+          break;
+        default:
+          throw new AssertionError();
+      }
+      docID++;
+    }
+    docIdsWriter.finish();
+    return docID;
+  }
+
  @Override
  public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
@ -458,7 +561,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
    meta.writeVInt(field.number);
    meta.writeByte(Lucene54DocValuesFormat.SORTED);
    addTermsDict(field, values);
-    addNumericField(field, docToOrd, false);
+    addNumericField(field, docToOrd, NumberType.ORDINAL);
  }

  @Override
@ -478,11 +581,11 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
        writeDictionary(uniqueValueSets);

        // write the doc -> set_id as a numeric field
-        addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
+        addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL);
      } else {
        meta.writeVInt(SORTED_WITH_ADDRESSES);
        // write the stream of values as a numeric field
-        addNumericField(field, values, true);
+        addNumericField(field, values, NumberType.VALUE);
        // write the doc -> ord count as a absolute index to the stream
        addOrdIndex(field, docToValueCount);
      }
@ -510,7 +613,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
        addTermsDict(field, values);

        // write the doc -> set_id as a numeric field
-        addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
+        addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL);
      } else {
        meta.writeVInt(SORTED_WITH_ADDRESSES);

@ -519,7 +622,7 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close

        // write the stream of ords as a numeric field
        // NOTE: we could return an iterator that delta-encodes these within a doc
-        addNumericField(field, ords, false);
+        addNumericField(field, ords, NumberType.ORDINAL);

        // write the doc -> ord count as a absolute index to the stream
        addOrdIndex(field, docToOrdCount);
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java
@ -19,18 +19,14 @@ package org.apache.lucene.codecs.lucene54;

 import java.io.IOException;

-import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
-import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.packed.DirectWriter;
-import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;

 /**
 * Lucene 5.4 DocValues format.
@ -51,6 +47,8 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 *        as blocks of bitpacked integers, encoding the deviation from the expected delta.
 *    <li>Const-compressed: when there is only one possible non-missing value, only the missing
 *        bitset is encoded.
+ *    <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
+ *        using binary search.
 * </ul>
 * <p>
 * {@link DocValuesType#BINARY BINARY}:
@ -96,93 +94,6 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 *   <li><tt>.dvd</tt>: DocValues data</li>
 *   <li><tt>.dvm</tt>: DocValues metadata</li>
 * </ol>
- * <ol>
- *   <li><a name="dvm"></a>
- *   <p>The DocValues metadata or .dvm file.</p>
- *   <p>For DocValues field, this stores metadata, such as the offset into the 
- *      DocValues data (.dvd)</p>
- *   <p>DocValues metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
- *   <ul>
- *     <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry</li>
- *     <li>NumericEntry --&gt; GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
- *     <li>GCDNumericEntry --&gt; NumericHeader,MinValue,GCD,BitsPerValue</li>
- *     <li>TableNumericEntry --&gt; NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,BitsPerValue</li>
- *     <li>DeltaNumericEntry --&gt; NumericHeader,MinValue,BitsPerValue</li>
- *     <li>MonotonicNumericEntry --&gt; NumericHeader,PackedVersion,BlockSize</li>
- *     <li>NumericHeader --&gt; FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset</li>
- *     <li>BinaryEntry --&gt; FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
- *     <li>FixedBinaryEntry --&gt; BinaryHeader</li>
- *     <li>VariableBinaryEntry --&gt; BinaryHeader,AddressOffset,PackedVersion,BlockSize</li>
- *     <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
- *     <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
- *     <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
- *     <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
- *     <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
- *     <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
- *     <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
- *     <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
- *     <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
- *     <li>SingleNumericEntry --&gt; SetHeader,NumericEntry</li>
- *     <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
- *     <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
- *     <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
- *     <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
- *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- *     <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
- *     <li>TableSize,BitsPerValue,TotalTableLength --&gt; {@link DataOutput#writeVInt vInt}</li>
- *     <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
- *   </ul>
- *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
- *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
- *   <p>FieldNumber of -1 indicates the end of metadata.</p>
- *   <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
- *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
- *   <p>EndOffset is the pointer to the end of the data in the DocValues data (.dvd)</p>
- *   <p>NumericType indicates how Numeric values will be compressed:
- *      <ul>
- *         <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded
- *             from the minimum value within the block. 
- *         <li>1 --&gt; gcd-compressed. When all integers share a common divisor, only quotients are stored
- *             using blocks of delta-encoded ints.
- *         <li>2 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
- *             a lookup table of unique values is written, followed by the ordinal for each document.
- *         <li>3 --&gt; monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC.
- *         <li>4 --&gt; const-compressed. Used when all non-missing values are the same.
- *      </ul>
- *   <p>BinaryType indicates how Binary values will be stored:
- *      <ul>
- *         <li>0 --&gt; fixed-width. All values have the same length, addressing by multiplication. 
- *         <li>1 --&gt; variable-width. An address for each value is stored.
- *         <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
- *      </ul>
- *   <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
- *       <ul>
- *         <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
- *             offset, and a second one from offset to ord/value.
- *         <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
- *             a regular Sorted/Numeric entry.
- *         <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
- *             numeric entry that maps each document to an ordinal in this table.
- *       </ul>
- *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
- *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
- *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
- *      is written for the addresses.
- *   <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
- *      If it's -1, then there are no missing values. If it's -2, all values are missing.
- *   <li><a name="dvd"></a>
- *   <p>The DocValues data or .dvd file.</p>
- *   <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
- *   <p>DocValues data (.dvd) --&gt; Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup>,Footer</p>
- *   <ul>
- *     <li>NumericData --&gt; DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
- *     <li>BinaryData --&gt;  {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
- *     <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li>
- *     <li>DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --&gt; {@link DirectWriter PackedInts}</li>
- *     <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
- *     <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
- *   </ul>
- * </ol>
 * @lucene.experimental
 */
 public final class Lucene54DocValuesFormat extends DocValuesFormat {
@ -207,8 +118,7 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
  static final String META_CODEC = "Lucene54DocValuesMetadata";
  static final String META_EXTENSION = "dvm";
  static final int VERSION_START = 0;
-  static final int VERSION_SORTEDSET_TABLE = 1;
-  static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
+  static final int VERSION_CURRENT = VERSION_START;
  
  // indicates docvalues type
  static final byte NUMERIC = 0;
@ -242,7 +152,9 @@ public final class Lucene54DocValuesFormat extends DocValuesFormat {
  static final int MONOTONIC_COMPRESSED = 3;
  /** Compressed with constant value (uses only missing bitset) */
  static final int CONST_COMPRESSED = 4;
-  
+  /** Compressed with sparse arrays. */
+  static final int SPARSE_COMPRESSED = 5;
+
  /** Uncompressed binary, written directly (fixed length). */
  static final int BINARY_FIXED_UNCOMPRESSED = 0;
  /** Uncompressed binary, written directly (variable length). */
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java
@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicLong;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.lucene54.Lucene54DocValuesConsumer.NumberType;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DocValues;
@ -314,6 +315,14 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
    NumericEntry entry = new NumericEntry();
    entry.format = meta.readVInt();
    entry.missingOffset = meta.readLong();
+    if (entry.format == SPARSE_COMPRESSED) {
+      // sparse bits need a bit more metadata
+      entry.numDocsWithValue = meta.readVLong();
+      final int blockShift = meta.readVInt();
+      // The consumer writes exactly numDocsWithValue doc IDs through DirectMonotonicWriter, so the
+      // metadata must be loaded for that same count. Asking for one more value makes the reader
+      // expect an extra metadata block whenever numDocsWithValue is 0 or a multiple of the block
+      // size, which misaligns every subsequent read from the meta stream.
+      entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift);
+      ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
+      directAddressesMeta.put(info.name, entry.monotonicMeta);
+    }
    entry.offset = meta.readLong();
    entry.count = meta.readVLong();
    switch(entry.format) {
@ -351,6 +360,30 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
        ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
        directAddressesMeta.put(info.name, entry.monotonicMeta);
        break;
+      case SPARSE_COMPRESSED:
+        final byte numberType = meta.readByte();
+        switch (numberType) {
+          case 0:
+            entry.numberType = NumberType.VALUE;
+            break;
+          case 1:
+            entry.numberType = NumberType.ORDINAL;
+            break;
+          default:
+            throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta);
+        }
+
+        // now read the numeric entry for non-missing values
+        final int fieldNumber = meta.readVInt();
+        if (fieldNumber != info.number) {
+          throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta);
+        }
+        final int dvFormat = meta.readByte();
+        if (dvFormat != NUMERIC) {
+          throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + NUMERIC, meta);
+        }
+        entry.nonMissingValues = readNumericEntry(info, meta);
+        break;
      default:
        throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta);
    }
@ -493,11 +526,162 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
          }
        };
      }
+      case SPARSE_COMPRESSED:
+        final SparseBits docsWithField = getSparseLiveBits(entry);
+        final LongValues values = getNumeric(entry.nonMissingValues);
+        final long missingValue;
+        switch (entry.numberType) {
+          case ORDINAL:
+            missingValue = -1L;
+            break;
+          case VALUE:
+            missingValue = 0L;
+            break;
+          default:
+            throw new AssertionError();
+        }
+        return new SparseLongValues(docsWithField, values, missingValue);
      default:
        throw new AssertionError();
    }
  }

+  static class SparseBits implements Bits { // membership test over the doc IDs held in docIds; get() moves a cursor (index/docId/nextDocId)
+
+    final long maxDoc, docIDsLength, firstDocId; // firstDocId is docIds.get(0), or maxDoc when there are no doc IDs
+    final LongValues docIds; // NOTE(review): the searches below assume increasing order -- confirm against the writer
+
+    long index;     // index of docId in docIds
+    long docId;     // doc ID at index
+    long nextDocId; // doc ID at (index+1)
+
+    SparseBits(long maxDoc, long docIDsLength, LongValues docIDs) {
+      if (docIDsLength > 0 && maxDoc <= docIDs.get(docIDsLength - 1)) { // the last stored doc ID must be < maxDoc
+        throw new IllegalArgumentException("maxDoc must be > the last element of docIDs");
+      }
+      this.maxDoc = maxDoc;
+      this.docIDsLength = docIDsLength;
+      this.docIds = docIDs;
+      this.firstDocId = docIDsLength == 0 ? maxDoc : docIDs.get(0);
+      reset();
+    }
+
+    private void reset() { // puts the cursor before the first entry: index=-1, docId=-1, nextDocId=firstDocId
+      index = -1;
+      this.docId = -1;
+      this.nextDocId = firstDocId;
+    }
+
+    /** Gallop forward and stop as soon as an index is found that is greater than
+     *  the given docId. {@code index} will store an index that stores a value
+     *  that is &lt;= {@code docId} while the return value will give an index
+     *  that stores a value that is &gt; {@code docId}. These indices can then be
+     *  used to binary search. */
+    private long gallop(long docId) {
+      index++; // step onto the entry that was "next"; get() only calls this when docId >= nextDocId
+      this.docId = nextDocId;
+      long hiIndex = index + 1;
+
+      while (true) {
+        if (hiIndex >= docIDsLength) { // ran past the end: clamp and use maxDoc as the upper bound
+          hiIndex = docIDsLength;
+          nextDocId = maxDoc;
+          break;
+        }
+
+        final long hiDocId = docIds.get(hiIndex);
+        if (hiDocId > docId) { // found an entry above the target, stop galloping
+          nextDocId = hiDocId;
+          break;
+        }
+
+        final long delta = hiIndex - index;
+        index = hiIndex; // hiDocId <= docId, so it becomes the new lower bound
+        this.docId = hiDocId;
+        hiIndex += delta << 1; // double the step each time
+      }
+      return hiIndex;
+    }
+
+    private void binarySearch(long hiIndex, long docId) { // narrows (index, hiIndex) until adjacent, keeping this.docId/nextDocId in sync
+      while (index + 1 < hiIndex) {
+        final long midIndex = (index + hiIndex) >>> 1;
+        final long midDocId = docIds.get(midIndex);
+        if (midDocId > docId) {
+          hiIndex = midIndex;
+          nextDocId = midDocId;
+        } else {
+          index = midIndex;
+          this.docId = midDocId;
+        }
+      }
+    }
+
+    private boolean checkInvariants(long nextIndex, long docId) { // always returns true so it can be wrapped in an assert
+      assert this.docId <= docId;
+      assert this.nextDocId > docId;
+      assert (index == -1 && this.docId == -1) || this.docId == docIds.get(index);
+      assert (nextIndex == docIDsLength && nextDocId == maxDoc) || nextDocId == docIds.get(nextIndex);
+      return true;
+    }
+
+    private void exponentialSearch(long docId) { // gallop to bracket docId, then binary search inside the bracket
+      // seek forward by doubling the interval on each iteration
+      final long hiIndex = gallop(docId);
+      assert checkInvariants(hiIndex, docId);
+
+      // now perform the actual binary search
+      binarySearch(hiIndex, docId);
+    }
+
+    boolean get(final long docId) { // true iff docId equals the stored doc ID the cursor ends up on
+      if (docId < this.docId) {
+        // reading doc IDs backward, go back to the start
+        reset();
+      }
+
+      if (docId >= nextDocId) { // target is at or beyond nextDocId: search forward
+        exponentialSearch(docId);
+      }
+
+      assert checkInvariants(index + 1, docId);
+      return docId == this.docId;
+    }
+
+    @Override
+    public boolean get(int index) { // Bits entry point; the parameter is a doc ID despite its name, forwarded to get(long)
+      return get((long) index);
+    }
+
+    @Override
+    public int length() {
+      return Math.toIntExact(maxDoc); // throws ArithmeticException if maxDoc does not fit in an int
+    }
+  }
+
+  static class SparseLongValues extends LongValues { // returns the stored value for docs in docsWithField, missingValue for all others
+
+    final SparseBits docsWithField;
+    final LongValues values; // looked up by docsWithField.index, not by doc ID
+    final long missingValue;
+
+    SparseLongValues(SparseBits docsWithField, LongValues values, long missingValue) {
+      this.docsWithField = docsWithField;
+      this.values = values;
+      this.missingValue = missingValue;
+    }
+
+    @Override
+    public long get(long docId) {
+      if (docsWithField.get(docId)) { // the call also positions docsWithField.index, which is read on the next line
+        return values.get(docsWithField.index);
+      } else {
+        return missingValue;
+      }
+    }
+
+  }
+
  @Override
  public BinaryDocValues getBinary(FieldInfo field) throws IOException {
    BinaryEntry bytes = binaries.get(field.name);
@ -658,7 +842,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
    if (ss.format == SORTED_SINGLE_VALUED) {
      NumericEntry numericEntry = numerics.get(field.name);
      final LongValues values = getNumeric(numericEntry);
-      final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
+      final Bits docsWithField;
+      if (numericEntry.format == SPARSE_COMPRESSED) {
+        docsWithField = ((SparseLongValues) values).docsWithField;
+      } else {
+        docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
+      }
      return DocValues.singleton(values, docsWithField);
    } else if (ss.format == SORTED_WITH_ADDRESSES) {
      NumericEntry numericEntry = numerics.get(field.name);
@ -898,6 +1087,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
    }
  }

+  private SparseBits getSparseLiveBits(NumericEntry entry) throws IOException { // builds the docs-with-value view from the entry's stored doc IDs
+    final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset); // slice covers bytes [missingOffset, offset) of the data input
+    final LongValues docIDs = DirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData);
+    return new SparseBits(maxDoc, entry.numDocsWithValue, docIDs);
+  }
+
  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    switch(field.getDocValuesType()) {
@ -912,7 +1107,11 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
        return getLiveBits(be.missingOffset, maxDoc);
      case NUMERIC:
        NumericEntry ne = numerics.get(field.name);
-        return getLiveBits(ne.missingOffset, maxDoc);
+        if (ne.format == SPARSE_COMPRESSED) {
+          return getSparseLiveBits(ne);
+        } else {
+          return getLiveBits(ne.missingOffset, maxDoc);
+        }
      default:
        throw new AssertionError();
    }
@ -950,6 +1149,12 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close
    long minValue;
    long gcd;
    long table[];
+
+    /** for sparse compression */
+    long numDocsWithValue;
+    NumericEntry nonMissingValues;
+    NumberType numberType;
+
  }

  /** metadata entry for a binary docvalues field */
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java
@ -18,32 +18,52 @@ package org.apache.lucene.codecs.lucene54;
 */

 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
+import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseBits;
+import org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseLongValues;
+import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
+import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.SerialMergeScheduler;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.TestUtil;

 /**
@ -115,7 +135,141 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
    }
  }
-  
+
+  /**
+   * Repeats the sparse doc-values vs. stored-fields comparison for at least two
+   * rounds; every round builds and verifies its own randomized index.
+   */
+  @Slow
+  public void testSparseDocValuesVsStoredFields() throws Exception {
+    for (int remaining = atLeast(2); remaining > 0; --remaining) {
+      doTestSparseDocValuesVsStoredFields();
+    }
+  }
+
+  /**
+   * Builds one randomized index in which only a small fraction of documents carry
+   * values, writing each value both as doc values (numeric, sorted, binary,
+   * sorted_numeric, sorted_set) and as stored fields, then checks for every
+   * document of every leaf that the doc values and the docs-with-field bits agree
+   * with what the stored fields say.
+   *
+   * @throws Exception if indexing or reading fails
+   */
+  private void doTestSparseDocValuesVsStoredFields() throws Exception {
+    // pool of random longs that every value-bearing document draws from
+    final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
+    for (int i = 0; i < values.length; ++i) {
+      values[i] = random().nextLong();
+    }
+
+    Directory dir = newFSDirectory(createTempDir());
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    conf.setMergeScheduler(new SerialMergeScheduler());
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+
+    // sparse compression is only enabled if less than 1% of docs have a value
+    final int avgGap = 100;
+
+    final int numDocs = atLeast(100);
+    // leading run of empty documents; the loop body runs nextInt(...) + 1 times,
+    // so there is always at least one
+    for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
+      writer.addDocument(new Document());
+    }
+    // chosen once per index: either every document gets exactly one multi-valued
+    // entry, or the per-document count is drawn with an upper bound from [2, 5]
+    final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+
+      // single-valued
+      // the stored field "value" is the reference the doc values are compared to
+      long docValue = values[random().nextInt(values.length)];
+      doc.add(new NumericDocValuesField("numeric", docValue));
+      doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
+      doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
+      doc.add(new StoredField("value", docValue));
+
+      // multi-valued
+      // values are drawn independently, so one document may receive the same
+      // value more than once; the stored fields "values" record every draw
+      final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
+      for (int j = 0; j < numValues; ++j) {
+        docValue = values[random().nextInt(values.length)];
+        doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
+        doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
+        doc.add(new StoredField("values", docValue));
+      }
+
+      writer.addDocument(doc);
+
+      // add a gap
+      for (int j = random().nextInt(avgGap * 2); j >= 0; --j) {
+        writer.addDocument(new Document());
+      }
+    }
+
+    // randomly verify either a single merged segment or whatever segments exist
+    if (random().nextBoolean()) {
+      writer.forceMerge(1);
+    }
+
+    final IndexReader indexReader = writer.getReader();
+    writer.close();
+
+    for (LeafReaderContext context : indexReader.leaves()) {
+      final LeafReader reader = context.reader();
+      final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
+      final Bits numericBits = DocValues.getDocsWithField(reader, "numeric");
+
+      final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
+      final Bits sortedBits = DocValues.getDocsWithField(reader, "sorted");
+
+      final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
+      final Bits binaryBits = DocValues.getDocsWithField(reader, "binary");
+
+      final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
+      final Bits sortedNumericBits = DocValues.getDocsWithField(reader, "sorted_numeric");
+
+      final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
+      final Bits sortedSetBits = DocValues.getDocsWithField(reader, "sorted_set");
+
+      for (int i = 0; i < reader.maxDoc(); ++i) {
+        final StoredDocument doc = reader.document(i);
+        final StorableField valueField = doc.getField("value");
+        // null marks a gap document, i.e. one indexed without any field
+        final Long value = valueField == null ? null : valueField.numericValue().longValue();
+
+        if (value == null) {
+          // expected results for a document without a value: 0, ord -1, empty bytes
+          assertEquals(0, numeric.get(i));
+          assertEquals(-1, sorted.getOrd(i));
+          assertEquals(new BytesRef(), binary.get(i));
+
+          assertFalse(numericBits.get(i));
+          assertFalse(sortedBits.get(i));
+          assertFalse(binaryBits.get(i));
+        } else {
+          assertEquals(value.longValue(), numeric.get(i));
+          assertTrue(sorted.getOrd(i) >= 0);
+          assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.getOrd(i)));
+          assertEquals(new BytesRef(Long.toString(value)), binary.get(i));
+
+          assertTrue(numericBits.get(i));
+          assertTrue(sortedBits.get(i));
+          assertTrue(binaryBits.get(i));
+        }
+
+        final StorableField[] valuesFields = doc.getFields("values");
+        // distinct values of this document; duplicates among the draws collapse here
+        final Set<Long> valueSet = new HashSet<>();
+        for (StorableField sf : valuesFields) {
+          valueSet.add(sf.numericValue().longValue());
+        }
+
+        // sorted_numeric is expected to report one entry per stored value,
+        // duplicates included
+        sortedNumeric.setDocument(i);
+        assertEquals(valuesFields.length, sortedNumeric.count());
+        for (int j = 0; j < sortedNumeric.count(); ++j) {
+          assertTrue(valueSet.contains(sortedNumeric.valueAt(j)));
+        }
+        // sorted_set is expected to report one ord per distinct value
+        sortedSet.setDocument(i);
+        int sortedSetCount = 0;
+        while (true) {
+          long ord = sortedSet.nextOrd();
+          if (ord == SortedSetDocValues.NO_MORE_ORDS) {
+            break;
+          }
+          assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
+          sortedSetCount++;
+        }
+        assertEquals(valueSet.size(), sortedSetCount);
+
+        assertEquals(!valueSet.isEmpty(), sortedNumericBits.get(i));
+        assertEquals(!valueSet.isEmpty(), sortedSetBits.get(i));
+      }
+    }
+
+    indexReader.close();
+    dir.close();
+  }
+
  // TODO: try to refactor this and some termsenum tests into the base class.
  // to do this we need to fix the test class to get a DVF not a Codec so we can setup
  // the postings format correctly.
@ -278,4 +432,74 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT
      }
    }
  }
+
+  public void testSparseLongValues() {
+    final int iters = atLeast(5);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int numDocs = TestUtil.nextInt(random(), 0, 100);
+      final long[] docIds = new long[numDocs];
+      final long[] values = new long[numDocs];
+      final long maxDoc;
+      if (numDocs == 0) {
+        maxDoc = 1 + random().nextInt(10);
+      } else {
+        docIds[0] = random().nextInt(10);
+        for (int i = 1; i < docIds.length; ++i) {
+          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
+        }
+        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
+      }
+      for (int i = 0; i < values.length; ++i) {
+        values[i] = random().nextLong();
+      }
+      final long missingValue = random().nextLong();
+      final LongValues docIdsValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return docIds[Math.toIntExact(index)];
+        }
+      };
+      final LongValues valuesValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return values[Math.toIntExact(index)];
+        }
+      };
+      final SparseBits liveBits = new SparseBits(maxDoc, numDocs, docIdsValues);
+      // random-access
+      for (int i = 0; i < 2000; ++i) {
+        final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
+        final boolean exists = liveBits.get(Math.toIntExact(docId));
+        assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
+      }
+      // sequential access
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final boolean exists = liveBits.get(Math.toIntExact(docId));
+        assertEquals(Arrays.binarySearch(docIds, docId) >= 0, exists);
+      }
+
+      final SparseLongValues sparseValues = new SparseLongValues(liveBits, valuesValues, missingValue);
+      // random-access
+      for (int i = 0; i < 2000; ++i) {
+        final long docId = TestUtil.nextLong(random(), 0, maxDoc - 1);
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = sparseValues.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+      // sequential access
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = sparseValues.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+    }
+  }
 }