LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)

This commit is contained in:
Adrien Grand 2022-04-25 09:18:21 +02:00 committed by GitHub
parent 1089b482fc
commit 2a4c21bb58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 6 deletions

View File

@ -121,6 +121,9 @@ Optimizations
* LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)
* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
Bug Fixes
---------------------
* LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite

View File

@ -1111,13 +1111,19 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
if (ord < 0 || ord >= entry.termsDictSize) {
throw new IndexOutOfBoundsException();
}
final long blockIndex = ord >>> TERMS_DICT_BLOCK_LZ4_SHIFT;
final long blockAddress = blockAddresses.get(blockIndex);
bytes.seek(blockAddress);
this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
do {
// Signed shift since ord is -1 when the terms enum is not positioned
final long currentBlockIndex = this.ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
final long blockIndex = ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
if (ord < this.ord || blockIndex != currentBlockIndex) {
// The looked up ord is before the current ord or belongs to a different block, seek again
final long blockAddress = blockAddresses.get(blockIndex);
bytes.seek(blockAddress);
this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
}
// Scan to the looked up ord
while (this.ord < ord) {
next();
} while (this.ord < ord);
}
}
private BytesRef getTermFromIndex(long index) throws IOException {

View File

@ -865,4 +865,69 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
ireader.close();
}
}
public void testSortedTermsDictLookupOrd() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
SortedDocValuesField field = new SortedDocValuesField("foo", new BytesRef());
doc.add(field);
final int numDocs = atLeast(Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
for (int i = 0; i < numDocs; ++i) {
field.setBytesValue(new BytesRef("" + i));
writer.addDocument(doc);
}
writer.forceMerge(1);
IndexReader reader = DirectoryReader.open(writer);
LeafReader leafReader = getOnlyLeafReader(reader);
doTestTermsDictLookupOrd(leafReader.getSortedDocValues("foo").termsEnum());
reader.close();
writer.close();
dir.close();
}
public void testSortedSetTermsDictLookupOrd() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
SortedSetDocValuesField field = new SortedSetDocValuesField("foo", new BytesRef());
doc.add(field);
final int numDocs = atLeast(2 * Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
for (int i = 0; i < numDocs; ++i) {
field.setBytesValue(new BytesRef("" + i));
writer.addDocument(doc);
}
writer.forceMerge(1);
IndexReader reader = DirectoryReader.open(writer);
LeafReader leafReader = getOnlyLeafReader(reader);
doTestTermsDictLookupOrd(leafReader.getSortedSetDocValues("foo").termsEnum());
reader.close();
writer.close();
dir.close();
}
private void doTestTermsDictLookupOrd(TermsEnum te) throws IOException {
List<BytesRef> terms = new ArrayList<>();
for (BytesRef term = te.next(); term != null; term = te.next()) {
terms.add(BytesRef.deepCopyOf(term));
}
// iterate in order
for (int i = 0; i < terms.size(); ++i) {
te.seekExact(i);
assertEquals(terms.get(i), te.term());
}
// iterate in reverse order
for (int i = terms.size() - 1; i >= 0; --i) {
te.seekExact(i);
assertEquals(terms.get(i), te.term());
}
// iterate in forward order with random gaps
for (int i = random().nextInt(5); i < terms.size(); i += random().nextInt(5)) {
te.seekExact(i);
assertEquals(terms.get(i), te.term());
}
}
}