LUCENE-6906: Fix Lucene54DocValuesFormat on large empty segments.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1715918 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2015-11-23 19:05:31 +00:00
parent 60bb927256
commit 8eea72d476
5 changed files with 241 additions and 34 deletions
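In short: when a doc-values field in a large segment ends up with zero values (or a block-aligned number of values), DirectMonotonicWriter.finish() flushed an empty buffer and Lucene54DocValuesProducer loaded metadata for one block too many. Below is a minimal sketch of the previously-failing round trip, mirroring the testEmpty test added in this commit — RAMDirectory and the fixed block shift of 16 are illustrative choices, the rest is the actual trunk API:

import org.apache.lucene.store.*;
import org.apache.lucene.util.packed.DirectMonotonicReader;
import org.apache.lucene.util.packed.DirectMonotonicWriter;

public class EmptyMonotonicRoundTrip {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    final int blockShift = 16; // Lucene54 uses a block shift of 16 for these addresses
    final long dataLength;
    try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT);
        IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) {
      // Zero values to write: before this commit, finish() called flush()
      // unconditionally here and mishandled the empty buffer.
      DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, 0, blockShift);
      w.finish();
      dataLength = dataOut.getFilePointer();
    }
    try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE);
        IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) {
      // Loading metadata for exactly 0 values must succeed and read 0 blocks.
      DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, 0, blockShift);
      DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength));
    }
    dir.close();
  }
}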

lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java

@@ -319,7 +319,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
         // sparse bits need a bit more metadata
         entry.numDocsWithValue = meta.readVLong();
         final int blockShift = meta.readVInt();
-        entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue + 1, blockShift);
+        entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift);
         ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
         directAddressesMeta.put(info.name, entry.monotonicMeta);
       }
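Why dropping the "+ 1" matters: the writer sizes this metadata for exactly numDocsWithValue values, so a reader that loads metadata for one extra value expects one extra block whenever numDocsWithValue is an exact multiple of the block size (including zero). A hedged worked example — the ceiling-division block count is an assumption inferred from the symptom, not code shown in this diff:

// Hypothetical illustration, not commit code.
final int blockShift = 16;                 // Lucene54's direct-monotonic block shift
final long blockSize = 1L << blockShift;
final long numDocsWithValue = blockSize;   // exactly one full block of doc ids
long writerBlocks = (numDocsWithValue + blockSize - 1) >>> blockShift;     // = 1
long readerBlocks = (numDocsWithValue + 1 + blockSize - 1) >>> blockShift; // = 2
// The old reader would decode per-block metadata for a second block that the
// writer never emitted, desyncing everything read afterwards.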

lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicWriter.java

@@ -123,7 +123,9 @@ public final class DirectMonotonicWriter {
     if (finished) {
       throw new IllegalStateException("#finish has been called already");
     }
-    flush();
+    if (bufferSize > 0) {
+      flush();
+    }
     finished = true;
   }
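The new guard is needed because add() flushes the buffer as soon as it fills, so finish() can see an empty buffer whenever the total value count is zero or an exact multiple of the block size — exactly the shapes the reworked testRandom below exercises; flushing an empty buffer would either throw or mis-encode a spurious block. A sketch of the two cases, assuming metaOut, dataOut and blockShift are in scope as in the tests (the eager flush inside add() is inferred from the test changes, not shown in this diff; separate outputs per writer are omitted for brevity):

// Hypothetical illustration, not commit code.
DirectMonotonicWriter empty = DirectMonotonicWriter.getInstance(metaOut, dataOut, 0, blockShift);
empty.finish();                  // zero values: nothing was ever buffered

final int n = 1 << blockShift;   // exactly one full block of values
DirectMonotonicWriter aligned = DirectMonotonicWriter.getInstance(metaOut, dataOut, n, blockShift);
for (int i = 0; i < n; ++i) {
  aligned.add(i);                // the last add() fills the buffer and flushes it
}
aligned.finish();                // buffer is empty again; the guard skips flush()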

lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java

@@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene54;
  * limitations under the License.
  */
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -24,6 +25,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
@@ -138,7 +140,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatTestCase
   @Slow
   public void testSparseDocValuesVsStoredFields() throws Exception {
-    int numIterations = atLeast(2);
+    int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
       doTestSparseDocValuesVsStoredFields();
     }
@@ -158,7 +160,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatTestCase
     // sparse compression is only enabled if less than 1% of docs have a value
     final int avgGap = 100;
-    final int numDocs = atLeast(100);
+    final int numDocs = atLeast(200);
     for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
       writer.addDocument(new Document());
     }
@@ -185,7 +187,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatTestCase
       writer.addDocument(doc);
       // add a gap
-      for (int j = random().nextInt(avgGap * 2); j >= 0; --j) {
+      for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
         writer.addDocument(new Document());
       }
     }
@@ -502,4 +504,5 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatTestCase
       }
     }
+  }
 }
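A note on the gap tweak above: random().nextInt(avgGap * 2) draws from [0, avgGap * 2), while TestUtil.nextInt(random(), 0, avgGap * 2) is inclusive on both ends, i.e. [0, avgGap * 2]. With avgGap = 100, the old call could never draw the maximum value of 200 while the new one can; the expected gap stays around avgGap either way, preserving the under-1% value density that keeps sparse compression enabled.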

lucene/core/src/test/org/apache/lucene/util/packed/TestDirectMonotonic.java

@@ -32,6 +32,28 @@ import org.apache.lucene.util.TestUtil;
 public class TestDirectMonotonic extends LuceneTestCase {
 
+  public void testEmpty() throws IOException {
+    Directory dir = newDirectory();
+    final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT);
+
+    final long dataLength;
+    try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT);
+        IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) {
+      DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, 0, blockShift);
+      w.finish();
+      dataLength = dataOut.getFilePointer();
+    }
+
+    try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE);
+        IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) {
+      DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, 0, blockShift);
+      DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength));
+      // no exception
+    }
+
+    dir.close();
+  }
+
   public void testSimple() throws IOException {
     Directory dir = newDirectory();
     final int blockShift = 2;
@@ -100,38 +122,52 @@ public class TestDirectMonotonic extends LuceneTestCase {
   }
 
   public void testRandom() throws IOException {
-    Directory dir = newDirectory();
-    final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT);
-    final int numValues = TestUtil.nextInt(random(), 1, 1 << 20);
-    List<Long> actualValues = new ArrayList<>();
-    long previous = random().nextLong();
-    actualValues.add(previous);
-    for (int i = 1; i < numValues; ++i) {
-      previous += random().nextInt(1 << random().nextInt(20));
-      actualValues.add(previous);
-    }
-
-    final long dataLength;
-    try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT);
-        IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) {
-      DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, numValues, blockShift);
-      for (long v : actualValues) {
-        w.add(v);
-      }
-      w.finish();
-      dataLength = dataOut.getFilePointer();
-    }
-
-    try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE);
-        IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) {
-      DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, numValues, blockShift);
-      LongValues values = DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength));
-      for (int i = 0; i < numValues; ++i) {
-        assertEquals(actualValues.get(i).longValue(), values.get(i));
-      }
-    }
-
-    dir.close();
+    final int iters = atLeast(3);
+    for (int iter = 0; iter < iters; ++iter) {
+      Directory dir = newDirectory();
+      final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT);
+      final int maxNumValues = 1 << 20;
+      final int numValues;
+      if (random().nextBoolean()) {
+        // random number
+        numValues = TestUtil.nextInt(random(), 1, maxNumValues);
+      } else {
+        // multiple of the block size
+        final int numBlocks = TestUtil.nextInt(random(), 0, maxNumValues >>> blockShift);
+        numValues = TestUtil.nextInt(random(), 0, numBlocks) << blockShift;
+      }
+      List<Long> actualValues = new ArrayList<>();
+      long previous = random().nextLong();
+      if (numValues > 0) {
+        actualValues.add(previous);
+      }
+      for (int i = 1; i < numValues; ++i) {
+        previous += random().nextInt(1 << random().nextInt(20));
+        actualValues.add(previous);
+      }
+
+      final long dataLength;
+      try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT);
+          IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) {
+        DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, numValues, blockShift);
+        for (long v : actualValues) {
+          w.add(v);
+        }
+        w.finish();
+        dataLength = dataOut.getFilePointer();
+      }
+
+      try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE);
+          IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) {
+        DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, numValues, blockShift);
+        LongValues values = DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength));
+        for (int i = 0; i < numValues; ++i) {
+          assertEquals(actualValues.get(i).longValue(), values.get(i));
+        }
+      }
+      dir.close();
+    }
   }
 }

lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java

@@ -3148,6 +3148,172 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase
     assertEquals(term2.get(), enum2.term());
   }
 
+  // same as testSortedMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testSortedMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedDocValuesField("field", new BytesRef("hello")));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field");
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      assertEquals(-1, dv.getOrd(i));
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testSortedSetMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testSortedSetMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      dv.setDocument(i);
+      assertEquals(-1L, dv.nextOrd());
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testNumericMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new NumericDocValuesField("field", 42L));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    NumericDocValues dv = getOnlySegmentReader(ireader).getNumericDocValues("field");
+    Bits docsWithField = getOnlySegmentReader(ireader).getDocsWithField("field");
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      assertEquals(0, dv.get(i));
+      assertFalse(docsWithField.get(i));
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testSortedNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testSortedNumericMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedNumericDocValuesField("field", 42L));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedNumericDocValues dv = getOnlySegmentReader(ireader).getSortedNumericDocValues("field");
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      dv.setDocument(i);
+      assertEquals(0, dv.count());
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testBinaryMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testBinaryMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new BinaryDocValuesField("field", new BytesRef("hello")));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    BinaryDocValues dv = getOnlySegmentReader(ireader).getBinaryDocValues("field");
+    Bits docsWithField = getOnlySegmentReader(ireader).getDocsWithField("field");
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      assertEquals(new BytesRef(), dv.get(i));
+      assertFalse(docsWithField.get(i));
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
   protected boolean codecAcceptsHugeBinaryValues(String field) {
     return true;
   }
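Why all five new tests index atLeast(1024) empty docs: per the comment in TestLucene54DocValuesFormat, Lucene54 only enables sparse compression when fewer than 1% of docs carry a value, and the 1024 floor here suggests a minimum segment size is also required before the sparse path kicks in (the consumer-side predicate is not part of this diff, so treat that as an assumption). One valued doc among 1025+ gives a density under 0.1%, so the field is encoded sparsely; deleting that single doc and force-merging then yields a large segment whose sparse field has zero values — precisely the numDocsWithValue == 0 case that the producer and DirectMonotonicWriter fixes above handle.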