Fix: Lucene90DocValuesProducer.TermsDict.seekCeil doesn't always position bytes correctly (#12555)

2023-09-17 00:48:45 +01:00 · 2023-09-17 00:48:45 +01:00 · d633c9b7d4
parent 43c0d72b94
commit d633c9b7d4
3 changed files with 82 additions and 14 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -233,6 +233,9 @@ Bug Fixes

 * LUCENE-12521: Sort After returning in-correct result when missing values are competitive. (Chaitanya Gohel)

+* GITHUB#12555: Fix bug in TermsEnum#seekCeil on doc values terms enums
+  that causes IndexOutOfBoundsException. (Egor Potemkin)
+
 Other
 ---------------------

--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
@@ -1153,6 +1153,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
      assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0;
      assert hi == ((entry.termsDictSize - 1) >> entry.termsDictIndexShift)
          || getTermFromIndex(hi + 1).compareTo(text) > 0;
+      assert hi < 0 ^ entry.termsDictSize > 0; // return -1 iff empty term dict

      return hi;
    }
@@ -1169,7 +1170,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
    private long seekBlock(BytesRef text) throws IOException {
      long index = seekTermsIndex(text);
      if (index == -1L) {
-        return -1L;
+        // empty terms dict
+        this.ord = 0;
+        return -2L;
      }

      long ordLo = index << entry.termsDictIndexShift;
@@ -1193,26 +1196,30 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
      assert blockHi == ((entry.termsDictSize - 1) >>> TERMS_DICT_BLOCK_LZ4_SHIFT)
          || getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0;

+      // read the block only if term dict is not empty
+      assert entry.termsDictSize > 0;
+      // reset ord and bytes to the ceiling block even if
+      // text is before the first term (blockHi == -1)
+      final long block = Math.max(blockHi, 0);
+      final long blockAddress = blockAddresses.get(block);
+      this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
+      bytes.seek(blockAddress);
+      decompressBlock();
+
      return blockHi;
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      final long block = seekBlock(text);
-      if (block == -1) {
-        // before the first term, or empty terms dict
-        if (entry.termsDictSize == 0) {
-          ord = 0;
-          return SeekStatus.END;
-        } else {
-          seekExact(0L);
-          return SeekStatus.NOT_FOUND;
-        }
+      if (block == -2) {
+        // empty terms dict
+        assert entry.termsDictSize == 0;
+        return SeekStatus.END;
+      } else if (block == -1) {
+        // before the first term
+        return SeekStatus.NOT_FOUND;
      }
-      final long blockAddress = blockAddresses.get(block);
-      this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT;
-      bytes.seek(blockAddress);
-      decompressBlock();

      while (true) {
        int cmp = term.compareTo(text);
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
@@ -24,6 +24,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.function.Function;
 import java.util.function.LongSupplier;
 import java.util.function.Supplier;
 import org.apache.lucene.analysis.Analyzer;
@@ -958,4 +959,61 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
    reader.close();
    directory.close();
  }
+
+  // Testing termsEnum seekCeil edge case, where inconsistent internal state led to
+  // IndexOutOfBoundsException
+  // see https://github.com/apache/lucene/pull/12555 for details
+  public void testTermsEnumConsistency() throws IOException {
+    int numTerms =
+        Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE
+            + 10; // need more than one block of unique terms.
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+
+    // for simplicity, generate a sorted list of terms that are a) unique and b) all greater than
+    // the term that we want to use for the test
+    char termA = 'A';
+    Function<Integer, String> stringSupplier =
+        (Integer n) -> {
+          assert n < 25 * 25;
+          char[] chars = new char[] {(char) (termA + 1 + n / 25), (char) (termA + 1 + n % 25)};
+          return new String(chars);
+        };
+    SortedDocValuesField field =
+        new SortedDocValuesField("field", new BytesRef(stringSupplier.apply(0)));
+    doc.add(field);
+    iwriter.addDocument(doc);
+    for (int i = 1; i < numTerms; i++) {
+      field.setBytesValue(new BytesRef(stringSupplier.apply(i)));
+      iwriter.addDocument(doc);
+    }
+    // merging to one segment to make sure we have more than one block (TERMS_DICT_BLOCK_LZ4_SIZE)
+    // in a segment, to trigger next block decompression.
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    IndexReader reader = DirectoryReader.open(directory);
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    SortedDocValues values = leafReader.getSortedDocValues("field");
+    TermsEnum termsEnum = values.termsEnum();
+
+    // Position terms enum at 0
+    termsEnum.seekExact(0L);
+    assertEquals(0, termsEnum.ord());
+    // seekCeil to a term which doesn't exist in the index
+    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("A")));
+    // "A" sorts before every indexed term, so the enum must land on the first term (ord 0)
+    assertEquals(0, termsEnum.ord());
+
+    assertEquals(new BytesRef(stringSupplier.apply(0)), termsEnum.term());
+    // read more than one block of terms to trigger next block decompression
+    for (int i = 1; i < numTerms; i++) {
+      assertEquals(new BytesRef(stringSupplier.apply(i)), termsEnum.next());
+    }
+    assertNull(termsEnum.next());
+    reader.close();
+    directory.close();
+  }
 }