[Fix] Binary search the BlockTree term entries when all suffixes have the same length in a leaf block. (#11888)

* Binary search the entries when all suffixes have the same length in a leaf block.

* add comment on allEqual.

* BackwardsCompatibility: keep the same logic on fillTerm and SeekStatus(NOT_FOUND, END).

* Update comments: modify scan to binary search.

* Add unit test for binarySearchTermLeaf.

* Format code.

* Assert the value of termsEnum.term() is correct after seeking.

* Add CHANGES entry.

* Clarify "leaf block _of the terms dict_"

* Set suffixesReader's position.

* Advance to the greater term if binary search ended at the lesser term.

* Assert termsEnum's position after seeking.

* Tidy.

* Advance to the greater term if binary search ended at the less term: nextEnt plus 1.

* Advance to the greater term if binary search ended at the lesser term and a greater term exists.

* Add test case: target greater than the last entry of the matched block.

* Move test case that target greater than the last entry of the matched block to TestLucene90PostingsFormat.

* Move test case for target greater than the last entry of the matched block to TestLucene99PostingsFormat

* Clarify code.

* Replace ternary with verbose if.

* Replace seekExact with seekCeil.

* Replace division by 2 with logical right shift.

* Remove assert ste.termExists.

* Clarify code.

* Remove stale change entry.

* Fix comment.

---------

Co-authored-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
zhouhui 2024-04-01 21:14:14 +08:00 committed by GitHub
parent 42f2da5fe2
commit bf074502df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 170 additions and 8 deletions

View File

@ -75,6 +75,9 @@ final class SegmentTermsEnumFrame {
// True if all entries are terms
boolean isLeafBlock;
// True if all entries have the same length.
boolean allEqual;
long lastSubFP;
int nextFloorLabel;
@ -183,7 +186,7 @@ final class SegmentTermsEnumFrame {
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
int numSuffixLengthBytes = ste.in.readVInt();
final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
allEqual = (numSuffixLengthBytes & 0x01) != 0;
numSuffixLengthBytes >>>= 1;
if (suffixLengthBytes.length < numSuffixLengthBytes) {
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
@ -523,7 +526,15 @@ final class SegmentTermsEnumFrame {
// NOTE: sets startBytePos/suffix as a side effect
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
if (isLeafBlock) {
if (allEqual) {
return binarySearchTermLeaf(target, exactOnly);
} else {
return scanToTermLeaf(target, exactOnly);
}
} else {
return scanToTermNonLeaf(target, exactOnly);
}
}
private int startBytePos;
@ -554,8 +565,6 @@ final class SegmentTermsEnumFrame {
assert prefixMatches(target);
// TODO: binary search when all terms have the same length, which is common for ID fields,
// which are also the most sensitive to lookup performance?
// Loop over each entry (term or sub-block) in this block:
do {
nextEnt++;
@ -628,6 +637,97 @@ final class SegmentTermsEnumFrame {
return SeekStatus.END;
}
// Target's prefix matches this block's prefix, and all suffixes in this leaf
// block have the same length, so we can binary search the entries (instead of
// linearly scanning, as scanToTermLeaf does) to check if the suffix matches.
// NOTE: sets startBytePos/suffix as a side effect.
public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
  // if (DEBUG) System.out.println("  binarySearchTermLeaf: block fp=" + fp + " prefix=" +
  // prefix + "
  // nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" +
  // brToString(term));

  assert nextEnt != -1;

  // Every entry in a leaf block is a real term (no sub-blocks):
  ste.termExists = true;
  subCode = 0;

  if (nextEnt == entCount) {
    // Block already exhausted before we started:
    if (exactOnly) {
      fillTerm();
    }
    return SeekStatus.END;
  }

  assert prefixMatches(target);

  // All suffixes share one length; a single read gives it for every entry:
  suffix = suffixLengthsReader.readVInt();
  // TODO early terminate when target length unequals suffix + prefix.
  // But we need to keep the same status with scanToTermLeaf.
  int start = nextEnt;
  int end = entCount - 1;
  // Binary search the entries (terms) in this leaf block:
  int cmp = 0;
  while (start <= end) {
    // Unsigned shift avoids the (start + end) / 2 overflow pitfall:
    int mid = (start + end) >>> 1;
    nextEnt = mid + 1;
    // Fixed-length suffixes let us address any entry's bytes directly:
    startBytePos = mid * suffix;

    // Binary search bytes in the suffix, comparing to the target.
    cmp =
        Arrays.compareUnsigned(
            suffixBytes,
            startBytePos,
            startBytePos + suffix,
            target.bytes,
            target.offset + prefix,
            target.offset + target.length);

    if (cmp < 0) {
      start = mid + 1;
    } else if (cmp > 0) {
      end = mid - 1;
    } else {
      // Exact match!
      suffixesReader.setPosition(startBytePos + suffix);
      fillTerm();
      // if (DEBUG) System.out.println("  found!");
      return SeekStatus.FOUND;
    }
  }

  // It is possible (and OK) that terms index pointed us
  // at this block, but, we searched the entire block and
  // did not find the term to position to.  This happens
  // when the target is after the last term in the block
  // (but, before the next term in the index).  EG
  // target could be foozzz, and terms index pointed us
  // to the foo* block, but the last term in this block
  // was fooz (and, eg, first term in the next block will
  // be fop).
  // if (DEBUG) System.out.println("      block end");
  SeekStatus seekStatus;
  if (end < entCount - 1) {
    // The search ended before the block's last entry, so a greater term
    // exists in this block:
    seekStatus = SeekStatus.NOT_FOUND;
    // If binary search ended at the less term, and greater term exists.
    // We need to advance to the greater term.
    if (cmp < 0) {
      startBytePos += suffix;
      nextEnt++;
    }
    suffixesReader.setPosition(startBytePos + suffix);
    fillTerm();
  } else {
    // Target is past every term in this block:
    seekStatus = SeekStatus.END;
    suffixesReader.setPosition(startBytePos + suffix);
    if (exactOnly) {
      fillTerm();
    }
  }
  // TODO: not consistent that in the
  // not-exact case we don't next() into the next
  // frame here
  return seekStatus;
}
// Target's prefix matches this block's prefix; we
// scan the entries to check if the suffix matches.
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {

View File

@ -29,10 +29,7 @@ import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -41,6 +38,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
@ -143,4 +141,13 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
}
}
}
/**
 * Extra codec-specific checks for the binary-search-in-leaf-block path. Target {@code "10004a"}
 * lands in the leaf block holding entries [100001, 100003, ..., 100049]; since it sorts after
 * that block's last entry, {@code seekCeil} must advance to the first entry of the next leaf
 * block ({@code "100051"}).
 */
@Override
protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {
  assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a")));
  // assertEquals takes (expected, actual) — expected value goes first:
  assertEquals(new BytesRef("100051"), termsEnum.term());
}
}

View File

@ -369,6 +369,61 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
dir.close();
}
/**
 * Hook for codec-specific binary-search checks; overridden by postings-format test subclasses.
 * The default implementation is intentionally a no-op.
 */
protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {}
/**
 * Exercises the binary-search-within-a-leaf-block code path of the terms dictionary: indexes
 * only the odd ids in [100001, 100399] (all terms have equal length, which enables the binary
 * search), then verifies seekExact and seekCeil for present and absent targets.
 */
public void testBinarySearchTermLeaf() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(null);
  iwc.setCodec(getCodec());
  iwc.setMergePolicy(newTieredMergePolicy());
  IndexWriter iw = new IndexWriter(dir, iwc);
  for (int i = 100000; i <= 100400; i++) {
    // only add odd numbers, so every even target is absent
    if (i % 2 == 1) {
      Document document = new Document();
      document.add(new StringField("id", i + "", Field.Store.NO));
      iw.addDocument(document);
    }
  }
  iw.commit();
  // Single segment so all ids share one terms dictionary:
  iw.forceMerge(1);
  DirectoryReader reader = DirectoryReader.open(iw);
  TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator();

  // test seekExact
  for (int i = 100000; i <= 100400; i++) {
    BytesRef target = new BytesRef(i + "");
    if (i % 2 == 1) {
      assertTrue(termsEnum.seekExact(target));
      // assertEquals(expected, actual): expected value goes first
      assertEquals(target, termsEnum.term());
    } else {
      assertFalse(termsEnum.seekExact(target));
    }
  }

  subCheckBinarySearch(termsEnum);

  // test seekCeil
  for (int i = 100000; i < 100400; i++) {
    BytesRef target = new BytesRef(i + "");
    if (i % 2 == 1) {
      // Present term: FOUND, positioned exactly on it...
      assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target));
      assertEquals(target, termsEnum.term());
      if (i <= 100397) {
        // ...and next() returns the following (odd) term.
        assertEquals(new BytesRef(i + 2 + ""), termsEnum.next());
      }
    } else {
      // Absent (even) target: NOT_FOUND, positioned on the next greater term.
      assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target));
      assertEquals(new BytesRef(i + 1 + ""), termsEnum.term());
    }
  }
  // Past the last term: END.
  assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + "")));

  reader.close();
  iw.close();
  dir.close();
}
// tests that level 2 ghost fields still work
public void testLevel2Ghosts() throws Exception {
Directory dir = newDirectory();