diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 66231313e52..11e88c24b02 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -75,6 +75,9 @@ final class SegmentTermsEnumFrame { // True if all entries are terms boolean isLeafBlock; + // True if all entries' suffixes have the same length (low bit of numSuffixLengthBytes). + boolean allEqual; + long lastSubFP; int nextFloorLabel; @@ -183,7 +186,7 @@ final class SegmentTermsEnumFrame { suffixesReader.reset(suffixBytes, 0, numSuffixBytes); int numSuffixLengthBytes = ste.in.readVInt(); - final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0; + allEqual = (numSuffixLengthBytes & 0x01) != 0; numSuffixLengthBytes >>>= 1; if (suffixLengthBytes.length < numSuffixLengthBytes) { suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)]; @@ -523,7 +526,15 @@ final class SegmentTermsEnumFrame { // NOTE: sets startBytePos/suffix as a side effect public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { - return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly); + if (isLeafBlock) { + if (allEqual) { + return binarySearchTermLeaf(target, exactOnly); + } else { + return scanToTermLeaf(target, exactOnly); + } + } else { + return scanToTermNonLeaf(target, exactOnly); + } } private int startBytePos; @@ -554,8 +565,6 @@ final class SegmentTermsEnumFrame { assert prefixMatches(target); - // TODO: binary search when all terms have the same length, which is common for ID fields, - // which are also the most sensitive to lookup performance? 
// Loop over each entry (term or sub-block) in this block: do { nextEnt++; @@ -628,6 +637,97 @@ final class SegmentTermsEnumFrame { return SeekStatus.END; } + // Target's prefix matches this block's prefix, and all suffixes + // in this block have the same length, so we binary search the + // entries to check if the target's suffix matches one of them. + public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException { + // if (DEBUG) System.out.println(" binarySearchTermLeaf: block fp=" + fp + " prefix=" + + // prefix + " + // nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + + // brToString(term)); + + assert nextEnt != -1; + + ste.termExists = true; + subCode = 0; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + suffix = suffixLengthsReader.readVInt(); + // TODO: terminate early when the target length differs from prefix plus suffix, + // but we need to keep returning the same status as scanToTermLeaf. + int start = nextEnt; + int end = entCount - 1; + // Binary search the entries (terms) in this leaf block: + int cmp = 0; + while (start <= end) { + int mid = (start + end) >>> 1; + nextEnt = mid + 1; + startBytePos = mid * suffix; + + // Compare the suffix bytes of entry mid against the target's suffix. + cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffix, + target.bytes, + target.offset + prefix, + target.offset + target.length); + if (cmp < 0) { + start = mid + 1; + } else if (cmp > 0) { + end = mid - 1; + } else { + // Exact match! + suffixesReader.setPosition(startBytePos + suffix); + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } + + // It is possible (and OK) that terms index pointed us + // at this block, but, we searched the entire block and + // did not find the term to position to. 
This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // be fop). + // if (DEBUG) System.out.println(" block end"); + SeekStatus seekStatus; + if (end < entCount - 1) { + seekStatus = SeekStatus.NOT_FOUND; + // If the binary search ended on a term less than the target and a greater + // term exists, we need to advance to that greater term. + if (cmp < 0) { + startBytePos += suffix; + nextEnt++; + } + suffixesReader.setPosition(startBytePos + suffix); + fillTerm(); + } else { + seekStatus = SeekStatus.END; + suffixesReader.setPosition(startBytePos + suffix); + if (exactOnly) { + fillTerm(); + } + } + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return seekStatus; + } + // Target's prefix matches this block's prefix; we // scan the entries check if the suffix matches. 
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java index 99c0e0a6ae2..341805e8a3e 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99PostingsFormat.java @@ -29,10 +29,7 @@ import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.Impact; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.*; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -41,6 +38,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase { private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat()); @@ -143,4 +141,13 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase { } } } + + @Override + protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception { + // The block matched by "10004a" holds the entries [100001, 100003, ..., 100049]. + // If the target is greater than the last entry of the matched block, + // termsEnum.term() should be the first entry of the next leaf block. 
+ assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a"))); + assertEquals(new BytesRef("100051"), termsEnum.term()); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 1de01269d82..4d0024b93e3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -369,6 +369,61 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest dir.close(); } + protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {} + + public void testBinarySearchTermLeaf() throws Exception { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(null); + iwc.setCodec(getCodec()); + iwc.setMergePolicy(newTieredMergePolicy()); + IndexWriter iw = new IndexWriter(dir, iwc); + + for (int i = 100000; i <= 100400; i++) { + // Only index odd numbers, so every even target is a miss. + if (i % 2 == 1) { + Document document = new Document(); + document.add(new StringField("id", i + "", Field.Store.NO)); + iw.addDocument(document); + } + } + iw.commit(); + iw.forceMerge(1); + + DirectoryReader reader = DirectoryReader.open(iw); + TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator(); + // seekExact: odd ids are found, even ids are not. + for (int i = 100000; i <= 100400; i++) { + BytesRef target = new BytesRef(i + ""); + if (i % 2 == 1) { + assertTrue(termsEnum.seekExact(target)); + assertEquals(target, termsEnum.term()); + } else { + assertFalse(termsEnum.seekExact(target)); + } + } + + subCheckBinarySearch(termsEnum); + // seekCeil: odd ids are FOUND; an even id lands on the next odd id (NOT_FOUND). + for (int i = 100000; i < 100400; i++) { + BytesRef target = new BytesRef(i + ""); + if (i % 2 == 1) { + assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target)); + assertEquals(target, termsEnum.term()); + if (i <= 
100397) { + assertEquals(new BytesRef(i + 2 + ""), termsEnum.next()); + } + } else { + assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target)); + assertEquals(new BytesRef(i + 1 + ""), termsEnum.term()); + } + } + assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + ""))); + reader.close(); + iw.close(); + dir.close(); + } + // tests that level 2 ghost fields still work public void testLevel2Ghosts() throws Exception { Directory dir = newDirectory();