mirror of https://github.com/apache/lucene.git
[Fix] Binary search the BlockTree term entries when all suffixes have the same length in a leaf block. (#11888)
* Binary search the entries when all suffixes have the same length in a leaf block. * Add comment on allEqual. * BackwardsCompatibility: keep the same logic on fillTerm and SeekStatus(NOT_FOUND, END). * Update comments: modify scan to binary search. * Add unit test for binarySearchTermLeaf. * Format code. * Assert the value of termsEnum.term() is correct after seeking. * Add CHANGES entry. * Clarify "leaf block _of the terms dict_". * Set suffixesReader's position. * Advance to the greater term if binary search ended at the lesser term. * Assert termsEnum's position after seeking. * Tidy. * Advance to the greater term if binary search ended at the lesser term: nextEnt plus 1. * Advance to the greater term if binary search ended at the lesser term and a greater term exists. * Add test case: target greater than the last entry of the matched block. * Move test case where target is greater than the last entry of the matched block to TestLucene90PostingsFormat. * Move test case for target greater than the last entry of the matched block to TestLucene99PostingsFormat. * Clarify code. * Replace ternary with verbose if. * Replace seekExact with seekCeil. * Replace division by 2 with logical right shift. * Remove assert ste.termExists. * Clarify code. * Remove stale change entry. * Fix comment. --------- Co-authored-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
parent
42f2da5fe2
commit
bf074502df
|
@ -75,6 +75,9 @@ final class SegmentTermsEnumFrame {
|
||||||
// True if all entries are terms
|
// True if all entries are terms
|
||||||
boolean isLeafBlock;
|
boolean isLeafBlock;
|
||||||
|
|
||||||
|
// True if all entries in this block have the same suffix length.
|
||||||
|
boolean allEqual;
|
||||||
|
|
||||||
long lastSubFP;
|
long lastSubFP;
|
||||||
|
|
||||||
int nextFloorLabel;
|
int nextFloorLabel;
|
||||||
|
@ -183,7 +186,7 @@ final class SegmentTermsEnumFrame {
|
||||||
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
|
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
|
||||||
|
|
||||||
int numSuffixLengthBytes = ste.in.readVInt();
|
int numSuffixLengthBytes = ste.in.readVInt();
|
||||||
final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
|
allEqual = (numSuffixLengthBytes & 0x01) != 0;
|
||||||
numSuffixLengthBytes >>>= 1;
|
numSuffixLengthBytes >>>= 1;
|
||||||
if (suffixLengthBytes.length < numSuffixLengthBytes) {
|
if (suffixLengthBytes.length < numSuffixLengthBytes) {
|
||||||
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
|
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
|
||||||
|
@ -523,7 +526,15 @@ final class SegmentTermsEnumFrame {
|
||||||
|
|
||||||
// NOTE: sets startBytePos/suffix as a side effect
|
// NOTE: sets startBytePos/suffix as a side effect
|
||||||
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
|
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
|
||||||
return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
|
if (isLeafBlock) {
|
||||||
|
if (allEqual) {
|
||||||
|
return binarySearchTermLeaf(target, exactOnly);
|
||||||
|
} else {
|
||||||
|
return scanToTermLeaf(target, exactOnly);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return scanToTermNonLeaf(target, exactOnly);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int startBytePos;
|
private int startBytePos;
|
||||||
|
@ -554,8 +565,6 @@ final class SegmentTermsEnumFrame {
|
||||||
|
|
||||||
assert prefixMatches(target);
|
assert prefixMatches(target);
|
||||||
|
|
||||||
// TODO: binary search when all terms have the same length, which is common for ID fields,
|
|
||||||
// which are also the most sensitive to lookup performance?
|
|
||||||
// Loop over each entry (term or sub-block) in this block:
|
// Loop over each entry (term or sub-block) in this block:
|
||||||
do {
|
do {
|
||||||
nextEnt++;
|
nextEnt++;
|
||||||
|
@ -628,6 +637,97 @@ final class SegmentTermsEnumFrame {
|
||||||
return SeekStatus.END;
|
return SeekStatus.END;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Target's prefix matches this block's prefix,
|
||||||
|
// and all suffixes have the same length in this block, so
|
||||||
|
// we binary search the entries to check if the suffix matches.
|
||||||
|
public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
|
||||||
|
// if (DEBUG) System.out.println(" binarySearchTermLeaf: block fp=" + fp + " prefix=" +
|
||||||
|
// prefix + "
|
||||||
|
// nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" +
|
||||||
|
// brToString(term));
|
||||||
|
|
||||||
|
assert nextEnt != -1;
|
||||||
|
|
||||||
|
ste.termExists = true;
|
||||||
|
subCode = 0;
|
||||||
|
|
||||||
|
if (nextEnt == entCount) {
|
||||||
|
if (exactOnly) {
|
||||||
|
fillTerm();
|
||||||
|
}
|
||||||
|
return SeekStatus.END;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert prefixMatches(target);
|
||||||
|
|
||||||
|
suffix = suffixLengthsReader.readVInt();
|
||||||
|
// TODO: terminate early when the target length does not equal suffix + prefix.
|
||||||
|
// But we need to keep the same status as scanToTermLeaf.
|
||||||
|
int start = nextEnt;
|
||||||
|
int end = entCount - 1;
|
||||||
|
// Binary search the entries (terms) in this leaf block:
|
||||||
|
int cmp = 0;
|
||||||
|
while (start <= end) {
|
||||||
|
int mid = (start + end) >>> 1;
|
||||||
|
nextEnt = mid + 1;
|
||||||
|
startBytePos = mid * suffix;
|
||||||
|
|
||||||
|
// Compare the suffix of the entry at mid with the target's suffix.
|
||||||
|
cmp =
|
||||||
|
Arrays.compareUnsigned(
|
||||||
|
suffixBytes,
|
||||||
|
startBytePos,
|
||||||
|
startBytePos + suffix,
|
||||||
|
target.bytes,
|
||||||
|
target.offset + prefix,
|
||||||
|
target.offset + target.length);
|
||||||
|
if (cmp < 0) {
|
||||||
|
start = mid + 1;
|
||||||
|
} else if (cmp > 0) {
|
||||||
|
end = mid - 1;
|
||||||
|
} else {
|
||||||
|
// Exact match!
|
||||||
|
suffixesReader.setPosition(startBytePos + suffix);
|
||||||
|
fillTerm();
|
||||||
|
// if (DEBUG) System.out.println(" found!");
|
||||||
|
return SeekStatus.FOUND;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// It is possible (and OK) that terms index pointed us
|
||||||
|
// at this block, but, we searched the entire block and
|
||||||
|
// did not find the term to position to. This happens
|
||||||
|
// when the target is after the last term in the block
|
||||||
|
// (but, before the next term in the index). EG
|
||||||
|
// target could be foozzz, and terms index pointed us
|
||||||
|
// to the foo* block, but the last term in this block
|
||||||
|
// was fooz (and, eg, first term in the next block will
|
||||||
|
// be fop).
|
||||||
|
// if (DEBUG) System.out.println(" block end");
|
||||||
|
SeekStatus seekStatus;
|
||||||
|
if (end < entCount - 1) {
|
||||||
|
seekStatus = SeekStatus.NOT_FOUND;
|
||||||
|
// If the binary search ended at a lesser term and a greater term exists,
|
||||||
|
// we need to advance to the greater term.
|
||||||
|
if (cmp < 0) {
|
||||||
|
startBytePos += suffix;
|
||||||
|
nextEnt++;
|
||||||
|
}
|
||||||
|
suffixesReader.setPosition(startBytePos + suffix);
|
||||||
|
fillTerm();
|
||||||
|
} else {
|
||||||
|
seekStatus = SeekStatus.END;
|
||||||
|
suffixesReader.setPosition(startBytePos + suffix);
|
||||||
|
if (exactOnly) {
|
||||||
|
fillTerm();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: not consistent that in the
|
||||||
|
// not-exact case we don't next() into the next
|
||||||
|
// frame here
|
||||||
|
return seekStatus;
|
||||||
|
}
|
||||||
|
|
||||||
// Target's prefix matches this block's prefix; we
|
// Target's prefix matches this block's prefix; we
|
||||||
// scan the entries to check if the suffix matches.
|
// scan the entries to check if the suffix matches.
|
||||||
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
|
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
|
||||||
|
|
|
@ -29,10 +29,7 @@ import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.*;
|
||||||
import org.apache.lucene.index.Impact;
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
@ -41,6 +38,7 @@ import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.tests.analysis.MockAnalyzer;
|
import org.apache.lucene.tests.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
|
import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
|
||||||
import org.apache.lucene.tests.util.TestUtil;
|
import org.apache.lucene.tests.util.TestUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
|
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
|
||||||
|
@ -143,4 +141,13 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {
|
||||||
|
// The block matched by target 10004a has entries [100001, 100003, ..., 100049].
|
||||||
|
// If the target is greater than the last entry of the matched block,
|
||||||
|
// termsEnum.term() should be the next leaf block's first entry.
|
||||||
|
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("10004a")));
|
||||||
|
assertEquals(termsEnum.term(), new BytesRef("100051"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -369,6 +369,61 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {}
|
||||||
|
|
||||||
|
public void testBinarySearchTermLeaf() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
|
||||||
|
IndexWriterConfig iwc = newIndexWriterConfig(null);
|
||||||
|
iwc.setCodec(getCodec());
|
||||||
|
iwc.setMergePolicy(newTieredMergePolicy());
|
||||||
|
IndexWriter iw = new IndexWriter(dir, iwc);
|
||||||
|
|
||||||
|
for (int i = 100000; i <= 100400; i++) {
|
||||||
|
// only add odd numbers
|
||||||
|
if (i % 2 == 1) {
|
||||||
|
Document document = new Document();
|
||||||
|
document.add(new StringField("id", i + "", Field.Store.NO));
|
||||||
|
iw.addDocument(document);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
iw.commit();
|
||||||
|
iw.forceMerge(1);
|
||||||
|
|
||||||
|
DirectoryReader reader = DirectoryReader.open(iw);
|
||||||
|
TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator();
|
||||||
|
// test seekExact
|
||||||
|
for (int i = 100000; i <= 100400; i++) {
|
||||||
|
BytesRef target = new BytesRef(i + "");
|
||||||
|
if (i % 2 == 1) {
|
||||||
|
assertTrue(termsEnum.seekExact(target));
|
||||||
|
assertEquals(termsEnum.term(), target);
|
||||||
|
} else {
|
||||||
|
assertFalse(termsEnum.seekExact(target));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
subCheckBinarySearch(termsEnum);
|
||||||
|
// test seekCeil
|
||||||
|
for (int i = 100000; i < 100400; i++) {
|
||||||
|
BytesRef target = new BytesRef(i + "");
|
||||||
|
if (i % 2 == 1) {
|
||||||
|
assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target));
|
||||||
|
assertEquals(termsEnum.term(), target);
|
||||||
|
if (i <= 100397) {
|
||||||
|
assertEquals(new BytesRef(i + 2 + ""), termsEnum.next());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target));
|
||||||
|
assertEquals(new BytesRef(i + 1 + ""), termsEnum.term());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + "")));
|
||||||
|
reader.close();
|
||||||
|
iw.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
// tests that level 2 ghost fields still work
|
// tests that level 2 ghost fields still work
|
||||||
public void testLevel2Ghosts() throws Exception {
|
public void testLevel2Ghosts() throws Exception {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
|
|
Loading…
Reference in New Issue