LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)

2022-04-25 09:18:21 +02:00 · 2022-04-25 09:18:21 +02:00 · 2a4c21bb58
parent 1089b482fc
commit 2a4c21bb58
3 changed files with 80 additions and 6 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -121,6 +121,9 @@ Optimizations
 * LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)
 * LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
  and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
 Bug Fixes
 ---------------------
 * LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
@ -1111,13 +1111,19 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
      if (ord < 0 || ord >= entry.termsDictSize) {
        throw new IndexOutOfBoundsException();
      }
-      final long blockIndex = ord >>> TERMS_DICT_BLOCK_LZ4_SHIFT;
+      // Signed shift since ord is -1 when the terms enum is not positioned
-      final long blockAddress = blockAddresses.get(blockIndex);
+      final long currentBlockIndex = this.ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
-      bytes.seek(blockAddress);
+      final long blockIndex = ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
-      this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
+      if (ord < this.ord || blockIndex != currentBlockIndex) {
-      do {
+        // The looked up ord is before the current ord or belongs to a different block, seek again
        final long blockAddress = blockAddresses.get(blockIndex);
        bytes.seek(blockAddress);
        this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
      }
      // Scan to the looked up ord
      while (this.ord < ord) {
        next();
-      } while (this.ord < ord);
+      }
    }
    private BytesRef getTermFromIndex(long index) throws IOException {
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
@ -865,4 +865,69 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
      ireader.close();
    }
  }
  /**
   * Indexes at least {@code TERMS_DICT_BLOCK_LZ4_SIZE + 1} documents, each carrying a distinct
   * sorted doc value, force-merges them into a single segment and runs the ord-based seek checks
   * of {@link #doTestTermsDictLookupOrd(TermsEnum)} against the field's terms enum.
   */
  public void testSortedTermsDictLookupOrd() throws IOException {
    // try-with-resources closes reader, then writer, then dir (the same order as explicit
    // close() calls would) even when an assertion or an IOException aborts the test, so the
    // original failure is not accompanied by leaked resources.
    try (Directory dir = newDirectory();
        IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) {
      Document doc = new Document();
      // A single field instance is reused; only its value changes between documents.
      SortedDocValuesField field = new SortedDocValuesField("foo", new BytesRef());
      doc.add(field);
      final int numDocs = atLeast(Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
      for (int i = 0; i < numDocs; ++i) {
        field.setBytesValue(new BytesRef("" + i));
        writer.addDocument(doc);
      }
      // getOnlyLeafReader is used below, so merge everything down to one segment first.
      writer.forceMerge(1);
      try (IndexReader reader = DirectoryReader.open(writer)) {
        LeafReader leafReader = getOnlyLeafReader(reader);
        doTestTermsDictLookupOrd(leafReader.getSortedDocValues("foo").termsEnum());
      }
    }
  }
  /**
   * Indexes at least {@code 2 * TERMS_DICT_BLOCK_LZ4_SIZE + 1} documents, each carrying a distinct
   * sorted-set doc value, force-merges them into a single segment and runs the ord-based seek
   * checks of {@link #doTestTermsDictLookupOrd(TermsEnum)} against the field's terms enum.
   */
  public void testSortedSetTermsDictLookupOrd() throws IOException {
    // try-with-resources closes reader, then writer, then dir (the same order as explicit
    // close() calls would) even when an assertion or an IOException aborts the test, so the
    // original failure is not accompanied by leaked resources.
    try (Directory dir = newDirectory();
        IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) {
      Document doc = new Document();
      // A single field instance is reused; only its value changes between documents.
      SortedSetDocValuesField field = new SortedSetDocValuesField("foo", new BytesRef());
      doc.add(field);
      final int numDocs = atLeast(2 * Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
      for (int i = 0; i < numDocs; ++i) {
        field.setBytesValue(new BytesRef("" + i));
        writer.addDocument(doc);
      }
      // getOnlyLeafReader is used below, so merge everything down to one segment first.
      writer.forceMerge(1);
      try (IndexReader reader = DirectoryReader.open(writer)) {
        LeafReader leafReader = getOnlyLeafReader(reader);
        doTestTermsDictLookupOrd(leafReader.getSortedSetDocValues("foo").termsEnum());
      }
    }
  }
  /**
   * Checks that seeking {@code te} by ord lands on the same term that plain iteration produced at
   * that position. Ords are visited in ascending order, in descending order, and in ascending
   * order with random step sizes.
   *
   * @param te the terms enum to exercise; it is first consumed fully via {@code next()}
   * @throws IOException if the terms enum fails while iterating or seeking
   */
  private void doTestTermsDictLookupOrd(TermsEnum te) throws IOException {
    // Reference data: the term at list index i is the one expected after seekExact(i).
    // Terms are deep-copied before being stored.
    List<BytesRef> expected = new ArrayList<>();
    BytesRef current;
    while ((current = te.next()) != null) {
      expected.add(BytesRef.deepCopyOf(current));
    }
    final int count = expected.size();

    // ascending ords, one at a time
    int ord = 0;
    while (ord < count) {
      te.seekExact(ord);
      assertEquals(expected.get(ord), te.term());
      ++ord;
    }

    // descending ords, one at a time
    ord = count - 1;
    while (ord >= 0) {
      te.seekExact(ord);
      assertEquals(expected.get(ord), te.term());
      --ord;
    }

    // ascending ords with a random start and a fresh random step after every seek
    ord = random().nextInt(5);
    while (ord < count) {
      te.seekExact(ord);
      assertEquals(expected.get(ord), te.term());
      ord += random().nextInt(5);
    }
  }
 }