LUCENE-7485: Better storage of sparse values in Lucene70NormsFormat.

2016-10-11 09:20:34 +02:00 · 2016-10-11 09:20:34 +02:00 · 2fbbcac580
parent 63ef459022
commit 2fbbcac580
7 changed files with 504 additions and 212 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RoaringDocIdSet;
+
+/**
+ * Disk-based implementation of a {@link DocIdSetIterator} which can return
+ * the index of the current document, i.e. the ordinal of the current document
+ * among the list of documents that this iterator can return. This is useful
+ * to implement sparse doc values by only having to encode values for documents
+ * that actually have a value.
+ * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of
+ * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
+ * documents independently and picks between 3 encodings depending on the
+ * density of the range:<ul>
+ *   <li>{@code ALL} if the range contains 65536 documents exactly,
+ *   <li>{@code DENSE} if the range contains 4096 documents or more; in that
+ *       case documents are stored in a bit set,
+ *   <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
+ *       stored in a {@link DataInput#readShort() short}.
+ * </ul>
+ * <p>Only ranges that contain at least one value are encoded.
+ * <p>This implementation uses 6 bytes per document in the worst-case, which happens
+ * in the case that all ranges contain exactly one document.
+ * @lucene.internal
+ */
+final class IndexedDISI extends DocIdSetIterator {
+
+  static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
+
+  private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out) throws IOException {
+    assert block >= 0 && block < 65536;
+    out.writeShort((short) block);
+    assert cardinality > 0 && cardinality <= 65536;
+    out.writeShort((short) (cardinality - 1));
+    if (cardinality > MAX_ARRAY_LENGTH) {
+      if (cardinality != 65536) { // all docs are set
+        for (long word : buffer.getBits()) {
+          out.writeLong(word);
+        }
+      }
+    } else {
+      BitSetIterator it = new BitSetIterator(buffer, cardinality);
+      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+        out.writeShort((short) doc);
+      }
+    }
+  }
+
+  static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
+    int i = 0;
+    final FixedBitSet buffer = new FixedBitSet(1<<16);
+    int prevBlock = -1;
+    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+      final int block = doc >>> 16;
+      if (prevBlock != -1 && block != prevBlock) {
+        flush(prevBlock, buffer, i, out);
+        buffer.clear(0, buffer.length());
+        prevBlock = block;
+        i = 0;
+      }
+      buffer.set(doc & 0xFFFF);
+      i++;
+      prevBlock = block;
+    }
+    if (i > 0) {
+      flush(prevBlock, buffer, i, out);
+      buffer.clear(0, buffer.length());
+    }
+    // NO_MORE_DOCS is stored explicitly
+    buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
+    flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
+  }
+
+  /** The slice that stores the {@link DocIdSetIterator}. */
+  private final IndexInput slice;
+  private final long cost;
+
+  IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
+    this.slice = in.slice("docs", offset, length);
+    this.cost = cost;
+  }
+
+  private int block = -1;
+  private long blockEnd;
+  private int nextBlockIndex = -1;
+  Method method;
+
+  private int doc = -1;
+  private int index = -1;
+
+  // DENSE variables
+  private long word;
+  private int wordIndex = -1;
+  // number of one bits encountered so far, including those of `word`
+  private int numberOfOnes;
+
+  // ALL variables
+  private int gap;
+
+  @Override
+  public int docID() {
+    return doc;
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+    final int targetBlock = target & 0xFFFF0000;
+    if (block != targetBlock) {
+      advanceBlock(targetBlock);
+    }
+    if (block == targetBlock) {
+      if (method.advanceWithinBlock(this, target)) {
+        return doc;
+      }
+      readBlockHeader();
+    }
+    return doc = method.readFirstDoc(this);
+  }
+
+  private void advanceBlock(int targetBlock) throws IOException {
+    do {
+      slice.seek(blockEnd);
+      readBlockHeader();
+    } while (block < targetBlock);
+  }
+
+  private void readBlockHeader() throws IOException {
+    block = Short.toUnsignedInt(slice.readShort()) << 16;
+    assert block >= 0;
+    final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
+    index = nextBlockIndex;
+    nextBlockIndex = index + numValues;
+    if (numValues <= MAX_ARRAY_LENGTH) {
+      method = Method.SPARSE;
+      blockEnd = slice.getFilePointer() + (numValues << 1);
+    } else if (numValues == 65536) {
+      method = Method.ALL;
+      blockEnd = slice.getFilePointer();
+      gap = block - index - 1;
+    } else {
+      method = Method.DENSE;
+      blockEnd = slice.getFilePointer() + (1 << 13);
+      wordIndex = -1;
+      numberOfOnes = index + 1;
+    }
+  }
+
+  @Override
+  public int nextDoc() throws IOException {
+    return advance(doc + 1);
+  }
+
+  public int index() {
+    return index;
+  }
+
+  @Override
+  public long cost() {
+    return cost;
+  }
+
+  enum Method {
+    SPARSE {
+      @Override
+      int readFirstDoc(IndexedDISI disi) throws IOException {
+        disi.index++;
+        return disi.block | Short.toUnsignedInt(disi.slice.readShort());
+      }
+      @Override
+      boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+        final int targetInBlock = target & 0xFFFF;
+        // TODO: binary search
+        for (; disi.index < disi.nextBlockIndex;) {
+          int doc = Short.toUnsignedInt(disi.slice.readShort());
+          disi.index++;
+          if (doc >= targetInBlock) {
+            disi.doc = disi.block | doc;
+            return true;
+          }
+        }
+        return false;
+      }
+    },
+    DENSE {
+      @Override
+      int readFirstDoc(IndexedDISI disi) throws IOException {
+        do {
+          disi.word = disi.slice.readLong();
+          disi.wordIndex++;
+        } while (disi.word == 0L);
+        disi.index = disi.numberOfOnes;
+        disi.numberOfOnes += Long.bitCount(disi.word);
+        return disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
+      }
+      @Override
+      boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+        final int targetInBlock = target & 0xFFFF;
+        final int targetWordIndex = targetInBlock >>> 6;
+        for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
+          disi.word = disi.slice.readLong();
+          disi.numberOfOnes += Long.bitCount(disi.word);
+        }
+        disi.wordIndex = targetWordIndex;
+
+        long leftBits = disi.word >>> target;
+        if (leftBits != 0L) {
+          disi.doc = target + Long.numberOfTrailingZeros(leftBits);
+          disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
+          return true;
+        }
+
+        while (++disi.wordIndex < 1024) {
+          disi.word = disi.slice.readLong();
+          if (disi.word != 0) {
+            disi.index = disi.numberOfOnes;
+            disi.numberOfOnes += Long.bitCount(disi.word);
+            disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
+            return true;
+          }
+        }
+        return false;
+      }
+    },
+    ALL {
+      @Override
+      int readFirstDoc(IndexedDISI disi) {
+        return disi.block;
+      }
+      @Override
+      boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+        disi.doc = target;
+        disi.index = target - disi.gap;
+        return true;
+      }
+    };
+
+    /** Read the first document of the current block. */
+    abstract int readFirstDoc(IndexedDISI disi) throws IOException;
+
+    /** Advance to the first doc from the block that is equal to or greater than {@code target}.
+     *  Return true if there is such a doc and false otherwise. */
+    abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
+  }
+
+}
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
@ -96,12 +96,16 @@ final class Lucene70NormsConsumer extends NormsConsumer {

    if (numDocsWithValue == 0) {
      meta.writeLong(-2);
+      meta.writeLong(0L);
    } else if (numDocsWithValue == maxDoc) {
      meta.writeLong(-1);
+      meta.writeLong(0L);
    } else {
-      meta.writeLong(data.getFilePointer());
+      long offset = data.getFilePointer();
+      meta.writeLong(offset);
      values = normsProducer.getNorms(field);
-      SparseDISI.writeBitSet(values, maxDoc, data);
+      IndexedDISI.writeBitSet(values, data);
+      meta.writeLong(data.getFilePointer() - offset);
    }

    meta.writeInt(numDocsWithValue);
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
@ -45,9 +45,10 @@ import org.apache.lucene.store.DataOutput;
 *   <p>Norms metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
 *   <ul>
 *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- *     <li>Entry --&gt; FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress</li>
+ *     <li>Entry --&gt; FieldNumber, DocsWithFieldAddress, DocsWithFieldLength, NumDocsWithField, BytesPerNorm, NormsAddress</li>
 *     <li>FieldNumber --&gt; {@link DataOutput#writeInt Int32}</li>
 *     <li>DocsWithFieldAddress --&gt; {@link DataOutput#writeLong Int64}</li>
+ *     <li>DocsWithFieldLength --&gt; {@link DataOutput#writeLong Int64}</li>
 *     <li>NumDocsWithField --&gt; {@link DataOutput#writeInt Int32}</li>
 *     <li>BytesPerNorm --&gt; {@link DataOutput#writeByte byte}</li>
 *     <li>NormsAddress --&gt; {@link DataOutput#writeLong Int64}</li>
@ -60,6 +61,7 @@ import org.apache.lucene.store.DataOutput;
 *   <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
 *      in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
 *      value.</p>
+ *   <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
 *   <li><a name="nvd"></a>
 *   <p>The Norms data or .nvd file.</p>
 *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
@ -90,6 +90,7 @@ final class Lucene70NormsProducer extends NormsProducer {
  static class NormsEntry {
    byte bytesPerNorm;
    long docsWithFieldOffset;
+    long docsWithFieldLength;
    int numDocsWithField;
    long normsOffset;
  }
@ -108,6 +109,7 @@ final class Lucene70NormsProducer extends NormsProducer {
      }
      NormsEntry entry = new NormsEntry();
      entry.docsWithFieldOffset = meta.readLong();
+      entry.docsWithFieldLength = meta.readLong();
      entry.numDocsWithField = meta.readInt();
      entry.bytesPerNorm = meta.readByte();
      switch (entry.bytesPerNorm) {
@ -166,7 +168,7 @@ final class Lucene70NormsProducer extends NormsProducer {
    } else {
      // sparse
      final LongValues normValues = getNormValues(entry);
-      final SparseDISI disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField);
+      final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
      return new NumericDocValues() {

        @Override
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene70;
-
-import java.io.IOException;
-
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-
-final class SparseDISI extends DocIdSetIterator {
-
-  static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException {
-    int currentIndex = 0;
-    long currentBits = 0;
-    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-      final int index = doc >>> 6;
-      if (index > currentIndex) {
-        out.writeLong(currentBits);
-        for (int i = currentIndex + 1; i < index; ++i) {
-          out.writeLong(0L);
-        }
-        currentIndex = index;
-        currentBits = 0L;
-      }
-      currentBits |= 1L << doc;
-    }
-
-    out.writeLong(currentBits);
-    final int maxIndex = (maxDoc - 1) >>> 6;
-    for (int i = currentIndex + 1; i <= maxIndex; ++i) {
-      out.writeLong(0L);
-    }
-  }
-
-  final int maxDoc;
-  final int numWords;
-  final long cost;
-  final IndexInput slice;
-  int doc = -1;
-  int wordIndex = -1;
-  long word;
-  int index = -1;
-
-  SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException {
-    this.maxDoc = maxDoc;
-    this.numWords = (int) ((maxDoc + 63L) >>> 6);
-    this.slice = in.slice("docs", offset, numWords * 8L);
-    this.cost = cost;
-  }
-
-  @Override
-  public int advance(int target) throws IOException {
-    if (target >= maxDoc) {
-      return doc = NO_MORE_DOCS;
-    }
-
-    final int targetWordIndex = target >>> 6;
-    for (int i = wordIndex + 1; i <= targetWordIndex; ++i) {
-      word = slice.readLong();
-      index += Long.bitCount(word);
-    }
-    wordIndex = targetWordIndex;
-
-    long leftBits = word >>> target;
-    if (leftBits != 0L) {
-      return doc = target + Long.numberOfTrailingZeros(leftBits);
-    }
-
-    while (++wordIndex < numWords) {
-      word = slice.readLong();
-      if (word != 0) {
-        index += Long.bitCount(word);
-        return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word);
-      }
-    }
-
-    return doc = NO_MORE_DOCS;
-  }
-
-  @Override
-  public int nextDoc() throws IOException {
-    return advance(doc + 1);
-  }
-
-  @Override
-  public int docID() {
-    return doc;
-  }
-
-  @Override
-  public long cost() {
-    return cost;
-  }
-
-  public int index() {
-    return index - Long.bitCount(word >>> doc) + 1;
-  }
-
-}
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestIndexedDISI extends LuceneTestCase {
+
+  public void testEmpty() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testOneDoc() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    set.set(random().nextInt(maxDoc));
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testTwoDocs() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    set.set(random().nextInt(maxDoc));
+    set.set(random().nextInt(maxDoc));
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testAllDocs() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    set.set(1, maxDoc);
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testHalfFull() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
+      set.set(i);
+    }
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testDocRange() throws IOException {
+    try (Directory dir = newDirectory()) {
+      for (int iter = 0; iter < 10; ++iter) {
+        int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
+        FixedBitSet set = new FixedBitSet(maxDoc);
+        final int start = random().nextInt(maxDoc);
+        final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
+        set.set(start, end);
+        doTest(set, dir);
+      }
+    }
+  }
+
+  public void testSparseDenseBoundary() throws IOException {
+    try (Directory dir = newDirectory()) {
+      FixedBitSet set = new FixedBitSet(200000);
+      int start = 65536 + random().nextInt(100);
+
+      // we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
+      set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
+      long length;
+      try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
+        IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out);
+        length = out.getFilePointer();
+      }
+      try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
+        IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH);
+        assertEquals(start, disi.nextDoc());
+        assertEquals(IndexedDISI.Method.SPARSE, disi.method);
+      }
+      doTest(set, dir);
+
+      // now we set one more bit so the encoding will be dense
+      set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
+      try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
+        IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out);
+        length = out.getFilePointer();
+      }
+      try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
+        IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH + 1);
+        assertEquals(start, disi.nextDoc());
+        assertEquals(IndexedDISI.Method.DENSE, disi.method);
+      }
+      doTest(set, dir);
+    }
+  }
+
+  public void testOneDocMissing() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    set.set(0, maxDoc);
+    set.clear(random().nextInt(maxDoc));
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  public void testFewMissingDocs() throws IOException {
+    try (Directory dir = newDirectory()) {
+      for (int iter = 0; iter < 100; ++iter) {
+        int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+        FixedBitSet set = new FixedBitSet(maxDoc);
+        set.set(0, maxDoc);
+        final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
+        for (int i = 0; i < numMissingDocs; ++i) {
+          set.clear(random().nextInt(maxDoc));
+        }
+        doTest(set, dir);
+      }
+    }
+  }
+
+  public void testRandom() throws IOException {
+    try (Directory dir = newDirectory()) {
+      for (int i = 0; i < 100; ++i) {
+        doTestRandom(dir);
+      }
+    }
+  }
+
+  private void doTestRandom(Directory dir) throws IOException {
+    List<Integer> docs = new ArrayList<>();
+    final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 20));
+    final int numDocs = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / maxStep));
+    for (int doc = -1, i = 0; i < numDocs; ++i) {
+      doc += TestUtil.nextInt(random(), 1, maxStep);
+      docs.add(doc);
+    }
+    final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
+
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    for (int doc : docs) {
+      set.set(doc);
+    }
+
+    doTest(set, dir);
+  }
+
+  private void doTest(FixedBitSet set, Directory dir) throws IOException {
+    final int cardinality = set.cardinality();
+    long length;
+    try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+      IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out);
+      length = out.getFilePointer();
+    }
+
+    try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+      IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
+      BitSetIterator disi2 = new BitSetIterator(set, cardinality);
+      int i = 0;
+      for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
+        assertEquals(doc, disi.nextDoc());
+        assertEquals(i++, disi.index());
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
+    }
+
+    for (int step : new int[] {1, 10, 100, 1000, 10000, 100000}) {
+      try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+        IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
+        BitSetIterator disi2 = new BitSetIterator(set, cardinality);
+        int index = -1;
+        while (true) {
+          int target = disi2.docID() + step;
+          int doc;
+          do {
+            doc = disi2.nextDoc();
+            index++;
+          } while (doc < target);
+          assertEquals(doc, disi.advance(target));
+          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+            break;
+          }
+          assertEquals(index, disi.index());
+        }
+      }
+    }
+
+    dir.deleteFile("foo");
+  }
+
+}
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene70;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.BitSetIterator;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-
-public class TestSparseDISI extends LuceneTestCase {
-
-  public void testRandom() throws IOException {
-    try (Directory dir = newDirectory()) {
-      for (int i = 0; i < 1000; ++i) {
-        doTestRandom(dir);
-      }
-    }
-  }
-
-  private void doTestRandom(Directory dir) throws IOException {
-    List<Integer> docs = new ArrayList<>();
-    final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10));
-    final int numDocs = TestUtil.nextInt(random(), 1, 1000);
-    for (int doc = -1, i = 0; i < numDocs; ++i) {
-      doc += TestUtil.nextInt(random(), 1, maxStep);
-      docs.add(doc);
-    }
-    final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
-
-    FixedBitSet set = new FixedBitSet(maxDoc);
-    for (int doc : docs) {
-      set.set(doc);
-    }
-
-    try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
-      SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out);
-    }
-
-    try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
-      SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
-      BitSetIterator disi2 = new BitSetIterator(set, docs.size());
-      int i = 0;
-      for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
-        assertEquals(doc, disi.nextDoc());
-        assertEquals(i++, disi.index());
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
-    }
-
-    for (int step : new int[] {1, 20, maxStep, maxStep * 10}) {
-      try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
-        SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
-        BitSetIterator disi2 = new BitSetIterator(set, docs.size());
-        while (true) {
-          int target = disi2.docID() + step;
-          int doc = disi2.advance(target);
-          assertEquals(doc, disi.advance(target));
-          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-            break;
-          }
-          int index = Collections.binarySearch(docs, doc);
-          assertEquals(index, disi.index());
-        }
-      }
-    }
-
-    dir.deleteFile("foo");
-  }
-
-}