[Fix] Binary search the BlockTree term entries when all suffixes have the same length in a leaf block. (#11888)

* Binary search the entries when all suffixes have the same length in a leaf block.
* Add comment on allEqual.
* BackwardsCompatibility: keep the same logic on fillTerm and SeekStatus(NOT_FOUND, END).
* Update comments: modify scan to binary search.
* Add unit test for binarySearchTermLeaf.
* Format code.
* Assert the value of termsEnum.term() is correct after seeking.
* Add CHANGES entry.
* Clarify "leaf block _of the terms dict_".
* Set suffixesReader's position.
* Advance to the greater term if binary search ended at the less term.
* Assert termsEnum's position after seeking.
* Tidy.
* Advance to the greater term if binary search ended at the less term: nextEnt plus 1.
* Advance to the greater term if binary search ended at the less term and greater term exists.
* Add test case: target greater than the last entry of the matched block.
* Move test case for target greater than the last entry of the matched block to TestLucene90PostingsFormat.
* Move test case for target greater than the last entry of the matched block to TestLucene99PostingsFormat.
* Clarify code.
* Replace ternary with verbose if.
* Replace seekExact with seekCeil.
* Replace division by 2 with logical right shift.
* Remove assert ste.termExists.
* Clarify code.
* Remove stale change entry.
* Fix comment.
---------
Co-authored-by: Adrien Grand <jpountz@gmail.com>
2024-04-01 21:14:14 +08:00 · 2024-04-01 21:14:14 +08:00 · bf074502df
parent 42f2da5fe2
commit bf074502df
3 changed files with 170 additions and 8 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
@ -75,6 +75,9 @@ final class SegmentTermsEnumFrame {
  // True if all entries are terms
  boolean isLeafBlock;
  // True if all suffixes in this block have the same length. Decoded from the low bit of
  // the suffix-lengths header VInt when the block is read, and consulted by scanToTerm to
  // choose binary search over a linear scan for leaf blocks.
  boolean allEqual;
  long lastSubFP;
  int nextFloorLabel;
@ -183,7 +186,7 @@ final class SegmentTermsEnumFrame {
    suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
    int numSuffixLengthBytes = ste.in.readVInt();
-    final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
+    allEqual = (numSuffixLengthBytes & 0x01) != 0;
    numSuffixLengthBytes >>>= 1;
    if (suffixLengthBytes.length < numSuffixLengthBytes) {
      suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
@ -523,7 +526,15 @@ final class SegmentTermsEnumFrame {
  // Seeks to target within this block, picking the search strategy from the block's shape:
  // a leaf block whose suffixes all have the same length (allEqual) is binary searched, any
  // other leaf block is scanned entry by entry, and a non-leaf block uses the non-leaf scan.
  // Both arguments are passed through unchanged to the chosen method, whose SeekStatus is
  // returned as-is.
  // NOTE: sets startBytePos/suffix as a side effect
  public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
-    return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
+    if (isLeafBlock) {
      // Fixed-width suffixes make the entries directly addressable, so binary search applies.
      if (allEqual) {
        return binarySearchTermLeaf(target, exactOnly);
      } else {
        return scanToTermLeaf(target, exactOnly);
      }
    } else {
      return scanToTermNonLeaf(target, exactOnly);
    }
  }
  private int startBytePos;
@ -554,8 +565,6 @@ final class SegmentTermsEnumFrame {
    assert prefixMatches(target);
    // TODO: binary search when all terms have the same length, which is common for ID fields,
    // which are also the most sensitive to lookup performance?
    // Loop over each entry (term or sub-block) in this block:
    do {
      nextEnt++;
@ -628,6 +637,97 @@ final class SegmentTermsEnumFrame {
    return SeekStatus.END;
  }
  // Seeks to target inside a leaf block in which every suffix has the same length.
  // The target is expected (and asserted) to start with this block's prefix, so only the
  // bytes after the prefix are compared. Because the suffixes are fixed-width, entry i
  // starts at i * suffix in suffixBytes, which lets us binary search the entries instead
  // of scanning them one by one.
  //
  // Returns FOUND on an exact match, NOT_FOUND when the frame was positioned on the
  // smallest entry greater than target, and END when every remaining entry is smaller
  // than target.
  // NOTE: sets nextEnt/startBytePos/suffix as a side effect
  public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException {

    // if (DEBUG) System.out.println("    binarySearchTermLeaf: block fp=" + fp + " prefix=" +
    // prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) +
    // " term=" + brToString(term));

    assert nextEnt != -1;

    ste.termExists = true;
    subCode = 0;

    // No entries left to look at in this block.
    if (nextEnt == entCount) {
      if (exactOnly) {
        fillTerm();
      }
      return SeekStatus.END;
    }

    assert prefixMatches(target);

    // A single length read is used as the width of every entry in the search below.
    suffix = suffixLengthsReader.readVInt();
    // TODO early terminate when target length unequals suffix + prefix.
    // But we need to keep the same status with scanToTermLeaf.

    final int lastEnt = entCount - 1;
    // The part of target that follows the block prefix; constant across probes.
    final int targetSuffixStart = target.offset + prefix;
    final int targetEnd = target.offset + target.length;

    int lo = nextEnt;
    int hi = lastEnt;
    // Result of the most recent comparison: entry-vs-target, unsigned byte order.
    int order = 0;

    while (lo <= hi) {
      final int probe = (lo + hi) >>> 1;
      // Keep the frame state in step with the entry being probed.
      nextEnt = probe + 1;
      startBytePos = probe * suffix;

      order =
          Arrays.compareUnsigned(
              suffixBytes,
              startBytePos,
              startBytePos + suffix,
              target.bytes,
              targetSuffixStart,
              targetEnd);

      if (order == 0) {
        // Exact match: leave the suffix reader just past the matched entry.
        suffixesReader.setPosition(startBytePos + suffix);
        fillTerm();
        // if (DEBUG) System.out.println("        found!");
        return SeekStatus.FOUND;
      }

      if (order < 0) {
        lo = probe + 1;
      } else {
        hi = probe - 1;
      }
    }

    // TODO: not consistent that in the
    // not-exact case we don't next() into the next
    // frame here

    if (hi >= lastEnt) {
      // The upper bound never moved, so every entry we looked at sorts before target.
      // This is possible (and OK): the terms index pointed us at this block, but the
      // target falls after the last term in the block (and before the next term in the
      // index). E.g. target could be foozzz, and the terms index pointed us to the foo*
      // block, but the last term in this block was fooz (and, e.g., the first term in
      // the next block will be fop).
      // if (DEBUG) System.out.println("      block end");
      suffixesReader.setPosition(startBytePos + suffix);
      if (exactOnly) {
        fillTerm();
      }
      return SeekStatus.END;
    }

    // A greater entry exists in this block. If the last probe landed on the entry just
    // below target, step forward one entry so the frame sits on the greater one.
    if (order < 0) {
      startBytePos += suffix;
      nextEnt++;
    }
    suffixesReader.setPosition(startBytePos + suffix);
    fillTerm();
    return SeekStatus.NOT_FOUND;
  }
  // Target's prefix matches this block's prefix; we
  // scan the entries to check if the suffix matches.
  public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java
@ -29,10 +29,7 @@ import org.apache.lucene.codecs.lucene90.blocktree.Stats;
 import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.*;
 import org.apache.lucene.index.Impact;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@ -41,6 +38,7 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.BytesRef;
 public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
  private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
@ -143,4 +141,13 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
      }
    }
  }
  /**
   * Extra check run from {@code testBinarySearchTermLeaf}: a {@code seekCeil} whose target sorts
   * after the last entry of the block it is routed to must report {@code NOT_FOUND} and leave
   * the enum on the first entry of the following leaf block.
   *
   * @param termsEnum enum over the {@code id} terms indexed by {@code testBinarySearchTermLeaf}
   */
  @Override
  protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {
    // 10004a matched block's entries: [100001, 100003, ..., 100049].
    // If the target is greater than the last entry of the matched block,
    // termsEnum.term() should be the next leaf block's first entry.
    assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a")));
    // Expected value first, so a failure message reports the two values the right way round.
    assertEquals(new BytesRef("100051"), termsEnum.term());
  }
 }
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
@ -369,6 +369,61 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
    dir.close();
  }
  /**
   * Hook called by {@link #testBinarySearchTermLeaf()} after its {@code seekExact} checks and
   * before its {@code seekCeil} checks. The default implementation does nothing; subclasses may
   * override it to run additional assertions against the same {@code termsEnum}.
   *
   * @param termsEnum the enum over the {@code id} field that the test is seeking on
   */
  protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {}
  /**
   * Indexes the odd numbers in [100000, 100400] as {@code id} terms (all six characters long),
   * merges down to a single segment, and then checks {@code seekExact} and {@code seekCeil}
   * against every number in the range, whether it was indexed or not.
   *
   * <p>Subclasses can add checks between the two phases by overriding {@code
   * subCheckBinarySearch}.
   */
  public void testBinarySearchTermLeaf() throws Exception {
    // try-with-resources closes reader, writer and directory (in that order) even when an
    // assertion fails part-way through.
    try (Directory dir = newDirectory()) {
      IndexWriterConfig iwc = newIndexWriterConfig(null);
      iwc.setCodec(getCodec());
      iwc.setMergePolicy(newTieredMergePolicy());

      try (IndexWriter iw = new IndexWriter(dir, iwc)) {
        for (int i = 100000; i <= 100400; i++) {
          // only add odd numbers, so every even number is a guaranteed miss
          if (i % 2 == 1) {
            Document document = new Document();
            document.add(new StringField("id", Integer.toString(i), Field.Store.NO));
            iw.addDocument(document);
          }
        }
        iw.commit();
        iw.forceMerge(1);

        try (DirectoryReader reader = DirectoryReader.open(iw)) {
          TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator();

          // test seekExact
          for (int i = 100000; i <= 100400; i++) {
            BytesRef target = new BytesRef(Integer.toString(i));
            if (i % 2 == 1) {
              assertTrue(termsEnum.seekExact(target));
              assertEquals(target, termsEnum.term());
            } else {
              assertFalse(termsEnum.seekExact(target));
            }
          }

          subCheckBinarySearch(termsEnum);

          // test seekCeil
          for (int i = 100000; i < 100400; i++) {
            BytesRef target = new BytesRef(Integer.toString(i));
            if (i % 2 == 1) {
              assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target));
              assertEquals(target, termsEnum.term());
              // 100399 is the last indexed term, so only earlier terms have a successor.
              if (i <= 100397) {
                assertEquals(new BytesRef(Integer.toString(i + 2)), termsEnum.next());
              }
            } else {
              // A missing (even) id must land on the next odd id.
              assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target));
              assertEquals(new BytesRef(Integer.toString(i + 1)), termsEnum.term());
            }
          }
          // Nothing at or after 100400 was indexed.
          assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(Integer.toString(100400))));
        }
      }
    }
  }
  // tests that level 2 ghost fields still work
  public void testLevel2Ghosts() throws Exception {
    Directory dir = newDirectory();