mirror of https://github.com/apache/lucene.git

LUCENE-7489: Better sparsity support for Lucene70DocValuesFormat.

parent a4a314d160
commit 927fd51d64
@@ -29,6 +29,9 @@ Bug Fixes
 
 Improvements
 
+* LUCENE-7489: Better storage of sparse doc-values fields with the default
+  codec. (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
@@ -928,7 +928,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
         return ((CompressedBinaryDocValues)binary).getTermsEnum();
       } else {
@@ -1233,7 +1233,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
        return ((CompressedBinaryDocValues)binary).getTermsEnum();
       } else {
@@ -1292,7 +1292,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
         return ((CompressedBinaryDocValues) binary).getTermsEnum();
       } else {
@@ -1490,12 +1490,8 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
       }
     }
 
-    TermsEnum getTermsEnum() {
-      try {
-        return getTermsEnum(data.clone());
-      } catch (IOException e) {
-        throw new RuntimeException(e);
-      }
+    TermsEnum getTermsEnum() throws IOException {
+      return getTermsEnum(data.clone());
     }
 
     private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException {
@@ -361,7 +361,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedNumericField(mergeFieldInfo,
                           new EmptyDocValuesProducer() {
                             @Override
-                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) {
+                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
                               if (fieldInfo != mergeFieldInfo) {
                                 throw new IllegalArgumentException("wrong FieldInfo");
                               }
@@ -375,11 +375,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
-          try {
-            values = docValuesProducer.getSortedNumeric(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSortedNumeric(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -391,12 +387,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
       final long finalCost = cost;
 
-      final DocIDMerger<SortedNumericDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       return new SortedNumericDocValues() {
@@ -521,7 +512,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedField(fieldInfo,
                    new EmptyDocValuesProducer() {
                      @Override
-                     public SortedDocValues getSorted(FieldInfo fieldInfoIn) {
+                     public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
                        if (fieldInfoIn != fieldInfo) {
                          throw new IllegalArgumentException("wrong FieldInfo");
                        }
@@ -536,11 +527,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
-          try {
-            values = docValuesProducer.getSorted(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSorted(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -553,12 +540,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
       final long finalCost = cost;
 
-      final DocIDMerger<SortedDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       return new SortedDocValues() {
         private int docID = -1;
@@ -693,7 +675,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedSetField(mergeFieldInfo,
                       new EmptyDocValuesProducer() {
                         @Override
-                        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) {
+                        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
                           if (fieldInfo != mergeFieldInfo) {
                             throw new IllegalArgumentException("wrong FieldInfo");
                           }
@@ -709,11 +691,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
-          try {
-            values = docValuesProducer.getSortedSet(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSortedSet(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -723,12 +701,7 @@ public abstract class DocValuesConsumer implements Closeable {
         subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
       }
 
-      final DocIDMerger<SortedSetDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedSetDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       final long finalCost = cost;
[File diff suppressed because it is too large]
@@ -23,39 +23,64 @@ import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.packed.DirectWriter;
 
 /**
  * Lucene 7.0 DocValues format.
  * <p>
- * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
+ * Documents that have a value for the field are encoded in a way that it is always possible to
+ * know the ordinal of the current document in the set of documents that have a value. For instance,
+ * say the set of documents that have a value for the field is <tt>{1, 5, 6, 11}</tt>. When the
+ * iterator is on <tt>6</tt>, it knows that this is the 3rd item of the set. This way, values can
+ * be stored densely and accessed based on their index at search time. If all documents in a segment
+ * have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
+ * and is very fast at query time. On the other hand if some documents are missing a value for the
+ * field then the set of documents that have a value is encoded into blocks. All doc IDs that share
+ * the same upper 16 bits are encoded into the same block with the following strategies:
+ * <ul>
+ *   <li>SPARSE: This strategy is used when a block contains at most 4095 documents. The lower 16
+ *       bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
+ *       16 bits are given by the block ID.
+ *   <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
+ *       lower bits of doc IDs are stored in a bit set. Advancing is performed using
+ *       {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
+ *       accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
+ *   <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
+ *       the block is full. In that case doc IDs do not need to be stored explicitly. This is
+ *       typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
+ *       all documents that have a value for a field using contiguous doc IDs, for instance by
+ *       using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
+ * </ul>
+ * <p>
+ * Then the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) are
+ * encoded using the following strategies:
  * <p>
  * {@link DocValuesType#NUMERIC NUMERIC}:
  * <ul>
  *   <li>Delta-compressed: per-document integers written as deltas from the minimum value,
  *       compressed with bitpacking. For more information, see {@link DirectWriter}.
  *   <li>Table-compressed: when the number of unique values is very small (< 256), and
  *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
  *       a lookup table is written instead. Each per-document entry is instead the ordinal
  *       to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
  *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
  *       common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
  *   <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
  *       as blocks of bitpacked integers, encoding the deviation from the expected delta.
- *   <li>Const-compressed: when there is only one possible non-missing value, only the missing
- *       bitset is encoded.
- *   <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
- *       using binary search.
+ *   <li>Const-compressed: when there is only one possible value, no per-document data is needed and
+ *       this value is encoded alone.
 * </ul>
 * <p>
 * {@link DocValuesType#BINARY BINARY}:
 * <ul>
 *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
 *       Each document's value can be addressed directly with multiplication ({@code docID * length}).
 *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
 *       for each document. The addresses are written as Monotonic-compressed numerics.
 *   <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
 *       completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
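To make the DENSE strategy described in the javadoc above concrete, here is a minimal standalone sketch. The class and method names are invented for illustration; the real encoding and decoding added by this commit live in the Lucene70 consumer and producer whose diffs are suppressed on this page. One block covers 65536 candidate doc IDs, membership is a bit set of 1024 longs, advance() scans with Long.numberOfTrailingZeros, and index() accumulates Long.bitCount over the words already passed:

public class DenseBlockSketch {
  private final long[] words = new long[1024]; // 65536 bits, one per candidate doc in the block

  /** Marks the doc whose low 16 bits are {@code low} as present. */
  void add(int low) {
    words[low >>> 6] |= 1L << low; // long shifts use only the low 6 bits of 'low'
  }

  /** Smallest member >= target within the block, or -1 if the block is exhausted. */
  int advance(int target) {
    int w = target >>> 6;
    long word = words[w] & (-1L << target); // keep only bits >= target in this word
    while (true) {
      if (word != 0) {
        return (w << 6) | Long.numberOfTrailingZeros(word);
      }
      if (++w == words.length) {
        return -1;
      }
      word = words[w];
    }
  }

  /** Rank of a member: how many members precede it, i.e. its index in the set. */
  int index(int member) {
    int count = 0;
    for (int i = 0; i < (member >>> 6); ++i) {
      count += Long.bitCount(words[i]); // whole words before the member's word
    }
    return count + Long.bitCount(words[member >>> 6] & ((1L << member) - 1));
  }

  public static void main(String[] args) {
    DenseBlockSketch block = new DenseBlockSketch();
    for (int doc : new int[] {1, 5, 6, 11}) {
      block.add(doc);
    }
    int doc = block.advance(6);
    // prints "6 is item #3 of the set", matching the {1, 5, 6, 11} example in the javadoc
    System.out.println(doc + " is item #" + (block.index(doc) + 1) + " of the set");
  }
}

The same rank bookkeeping is what lets values be "stored densely and accessed based on their index at search time": the index returned here is the position of the document's value in the dense value stream.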
@@ -64,27 +89,21 @@ import org.apache.lucene.util.packed.DirectWriter;
  * <p>
  * {@link DocValuesType#SORTED SORTED}:
  * <ul>
- *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary,
+ *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-compressed Binary,
  *       along with the per-document ordinals written using one of the numeric strategies above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_SET SORTED_SET}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
  *   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
  *       an ordinal list and per-document index into this list are written using the numeric strategies
  *       above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
  *   <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
  *       strategies above.
  * </ul>
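The SORTED_SET layout above is easiest to see with plain arrays standing in for the compressed structures. This is a hypothetical in-memory model, not the commit's actual data structures: in the real format the term dictionary is written as Binary, and the addresses and ordinals use the numeric strategies above (the addresses being Monotonic-compressed). Ordinals index a sorted, deduplicated term dictionary, and each document that has a value owns one slice of a flat ordinal list:

public class SortedSetSketch {
  // ord -> term, sorted and deduplicated
  private final String[] termByOrd = {"apple", "banana", "cherry"};
  // concatenated ordinal lists of all documents that have a value
  private final long[] ords = {0, 2, 1, 0, 2};
  // slice boundaries: entry index's values are ords[addresses[index] .. addresses[index+1])
  private final long[] addresses = {0, 2, 3, 5};

  /** Prints the set of terms of the index-th document that has a value. */
  void printTerms(int index) {
    for (long a = addresses[index]; a < addresses[index + 1]; ++a) {
      System.out.println(termByOrd[(int) ords[(int) a]]);
    }
  }

  public static void main(String[] args) {
    new SortedSetSketch().printTerms(1); // slice [2, 3) -> prints "banana"
  }
}

Note that index here is the rank produced by the sparse doc-ID encoding described earlier, not the raw doc ID, which is what makes this layout work for sparse fields.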
@@ -112,72 +131,30 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat {
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     return new Lucene70DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
   }
 
-  static final String DATA_CODEC = "Lucene54DocValuesData";
+  static final String DATA_CODEC = "Lucene70DocValuesData";
   static final String DATA_EXTENSION = "dvd";
-  static final String META_CODEC = "Lucene54DocValuesMetadata";
+  static final String META_CODEC = "Lucene70DocValuesMetadata";
   static final String META_EXTENSION = "dvm";
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
 
   // indicates docvalues type
   static final byte NUMERIC = 0;
   static final byte BINARY = 1;
   static final byte SORTED = 2;
   static final byte SORTED_SET = 3;
   static final byte SORTED_NUMERIC = 4;
 
-  // address terms in blocks of 16 terms
-  static final int INTERVAL_SHIFT = 4;
-  static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
-  static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
-
-  // build reverse index from every 1024th term
-  static final int REVERSE_INTERVAL_SHIFT = 10;
-  static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
-  static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
-
-  // for conversion from reverse index to block
-  static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
-
-  /** Compressed using packed blocks of ints. */
-  static final int DELTA_COMPRESSED = 0;
-  /** Compressed by computing the GCD. */
-  static final int GCD_COMPRESSED = 1;
-  /** Compressed by giving IDs to unique values. */
-  static final int TABLE_COMPRESSED = 2;
-  /** Compressed with monotonically increasing values */
-  static final int MONOTONIC_COMPRESSED = 3;
-  /** Compressed with constant value (uses only missing bitset) */
-  static final int CONST_COMPRESSED = 4;
-  /** Compressed with sparse arrays. */
-  static final int SPARSE_COMPRESSED = 5;
-
-  /** Uncompressed binary, written directly (fixed length). */
-  static final int BINARY_FIXED_UNCOMPRESSED = 0;
-  /** Uncompressed binary, written directly (variable length). */
-  static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
-  /** Compressed binary with shared prefixes */
-  static final int BINARY_PREFIX_COMPRESSED = 2;
-
-  /** Standard storage for sorted set values with 1 level of indirection:
-   *  {@code docId -> address -> ord}. */
-  static final int SORTED_WITH_ADDRESSES = 0;
-  /** Single-valued sorted set values, encoded as sorted values, so no level
-   *  of indirection: {@code docId -> ord}. */
-  static final int SORTED_SINGLE_VALUED = 1;
-  /** Compressed giving IDs to unique sets of values:
-   *  {@code docId -> setId -> ords} */
-  static final int SORTED_SET_TABLE = 2;
-
-  /** placeholder for missing offset that means there are no missing values */
-  static final int ALL_LIVE = -1;
-  /** placeholder for missing offset that means all values are missing */
-  static final int ALL_MISSING = -2;
-
-  // addressing uses 16k blocks
-  static final int MONOTONIC_BLOCK_SIZE = 16384;
   static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
 
+  static final int TERMS_DICT_BLOCK_SHIFT = 4;
+  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
+  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
+
+  static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
+  static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
+  static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
 }
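Shift/size/mask triples like the new TERMS_DICT_BLOCK_* constants are conventionally used as pure bit arithmetic. The following is illustrative only, mirroring this file's values rather than the producer's actual lookup code (that diff is suppressed below); per the javadoc, each block of 16 terms stores its first term in full and the remaining 15 share prefixes with it:

public class TermsDictBlockMath {
  static final int TERMS_DICT_BLOCK_SHIFT = 4;
  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT; // 16 terms per block
  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;   // 0x0F

  public static void main(String[] args) {
    long ord = 37; // look up the term with this ordinal
    long block = ord >>> TERMS_DICT_BLOCK_SHIFT; // block 2 holds ords 32..47
    long within = ord & TERMS_DICT_BLOCK_MASK;   // 5 terms to decode past the block start
    System.out.println("block " + block + ", offset " + within);
  }
}

The TERMS_DICT_REVERSE_INDEX_* constants apply the same pattern at a coarser granularity (every 1024th term), which bounds how many blocks a term seek has to scan.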
[File diff suppressed because it is too large]
@@ -44,12 +44,12 @@ public abstract class EmptyDocValuesProducer extends DocValuesProducer {
   }
 
   @Override
-  public SortedNumericDocValues getSortedNumeric(FieldInfo field) {
+  public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) {
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
@@ -17,6 +17,8 @@
 package org.apache.lucene.index;
 
+import java.io.IOException;
+
 import org.apache.lucene.util.BytesRef;
 
 /**
@@ -103,7 +105,7 @@ public abstract class LegacySortedSetDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     throw new UnsupportedOperationException();
   }
 }
@@ -95,7 +95,7 @@ final class SingletonSortedSetDocValues extends SortedSetDocValues {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return in.termsEnum();
   }
 
@@ -104,7 +104,7 @@ public abstract class SortedDocValues extends BinaryDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedDocValuesTermsEnum(this);
   }
 }
@@ -98,7 +98,7 @@ public abstract class SortedSetDocValues extends DocIdSetIterator {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedSetDocValuesTermsEnum(this);
   }
 }
@@ -81,7 +81,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
     TermsEnum termsEnum = query.getTermsEnum(new Terms() {
 
       @Override
-      public TermsEnum iterator() {
+      public TermsEnum iterator() throws IOException {
         return fcsi.termsEnum();
       }
 
@@ -46,7 +46,6 @@ public final class DirectMonotonicReader {
   public static class Meta implements Accountable {
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Meta.class);
 
-    final long numValues;
     final int blockShift;
     final int numBlocks;
     final long[] mins;
@@ -55,7 +54,6 @@ public final class DirectMonotonicReader {
     final long[] offsets;
 
     Meta(long numValues, int blockShift) {
-      this.numValues = numValues;
       this.blockShift = blockShift;
       long numBlocks = numValues >>> blockShift;
       if ((numBlocks << blockShift) < numValues) {
@@ -25,14 +25,13 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.function.Supplier;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
+import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValues;
+import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValuesRandomAccessWrapper;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
@@ -62,7 +61,6 @@ import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMFile;
@@ -70,7 +68,6 @@ import org.apache.lucene.store.RAMInputStream;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.TestUtil;
 
 /**
@@ -123,7 +120,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumFixedWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 10, 10));
     }
   }
 
@@ -131,7 +128,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumVariableWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
@@ -139,7 +136,21 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumRandomMany() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
+  public void testTermsEnumLongSharedPrefixes() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> {
+        char[] chars = new char[random().nextInt(500)];
+        Arrays.fill(chars, 'a');
+        if (chars.length > 0) {
+          chars[random().nextInt(chars.length)] = 'b';
+        }
+        return new String(chars);
+      });
+    }
+  }
+
@@ -269,7 +280,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   // TODO: try to refactor this and some termsenum tests into the base class.
   // to do this we need to fix the test class to get a DVF not a Codec so we can setup
   // the postings format correctly.
-  private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
+  private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
     Directory dir = newFSDirectory(createTempDir());
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMergeScheduler(new SerialMergeScheduler());
@@ -294,12 +305,11 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
       Document doc = new Document();
       Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
       doc.add(idField);
-      final int length = TestUtil.nextInt(random(), minLength, maxLength);
       int numValues = random().nextInt(17);
       // create a random list of strings
       List<String> values = new ArrayList<>();
       for (int v = 0; v < numValues; v++) {
-        values.add(TestUtil.randomSimpleString(random(), minLength, length));
+        values.add(valuesProducer.get());
       }
 
       // add in any order to the indexed field
@@ -429,6 +439,92 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
     }
   }
 
+  public void testSparseLongValues() throws IOException {
+    final int iters = atLeast(5);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int numDocs = TestUtil.nextInt(random(), 0, 100);
+      final int[] docIds = new int[numDocs];
+      final long[] values = new long[numDocs];
+      final int maxDoc;
+      if (numDocs == 0) {
+        maxDoc = 1 + random().nextInt(10);
+      } else {
+        docIds[0] = random().nextInt(10);
+        for (int i = 1; i < docIds.length; ++i) {
+          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
+        }
+        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
+      }
+      for (int i = 0; i < values.length; ++i) {
+        values[i] = random().nextLong();
+      }
+      final long missingValue = random().nextLong();
+      final LongValues docIdsValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return docIds[Math.toIntExact(index)];
+        }
+      };
+      final LongValues valuesValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return values[Math.toIntExact(index)];
+        }
+      };
+      final SparseNumericDocValues sparseValues = new SparseNumericDocValues(numDocs, docIdsValues, valuesValues);
+
+      // sequential access
+      assertEquals(-1, sparseValues.docID());
+      for (int i = 0; i < docIds.length; ++i) {
+        assertEquals(docIds[i], sparseValues.nextDoc());
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
+
+      // advance
+      for (int i = 0; i < 2000; ++i) {
+        final int target = TestUtil.nextInt(random(), 0, maxDoc);
+        int index = Arrays.binarySearch(docIds, target);
+        if (index < 0) {
+          index = -1 - index;
+        }
+        sparseValues.reset();
+        if (index > 0) {
+          assertEquals(docIds[index - 1], sparseValues.advance(Math.toIntExact(docIds[index - 1])));
+        }
+        if (index == docIds.length) {
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.advance(target));
+        } else {
+          assertEquals(docIds[index], sparseValues.advance(target));
+        }
+      }
+
+      final SparseNumericDocValuesRandomAccessWrapper raWrapper = new SparseNumericDocValuesRandomAccessWrapper(sparseValues, missingValue);
+
+      // random-access
+      for (int i = 0; i < 2000; ++i) {
+        final int docId = TestUtil.nextInt(random(), 0, maxDoc - 1);
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = raWrapper.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+
+      // sequential access
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = raWrapper.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+    }
+  }
+
   @Slow
   public void testSortedSetAroundBlockSize() throws IOException {
     final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;