mirror of https://github.com/apache/lucene.git
LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
This commit is contained in:
parent
1089b482fc
commit
2a4c21bb58
|
@ -121,6 +121,9 @@ Optimizations
|
|||
|
||||
* LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)
|
||||
|
||||
* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
|
||||
and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
|
||||
|
|
|
@ -1111,13 +1111,19 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
if (ord < 0 || ord >= entry.termsDictSize) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
final long blockIndex = ord >>> TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
final long blockAddress = blockAddresses.get(blockIndex);
|
||||
bytes.seek(blockAddress);
|
||||
this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
|
||||
do {
|
||||
// Signed shift since ord is -1 when the terms enum is not positioned
|
||||
final long currentBlockIndex = this.ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
final long blockIndex = ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
if (ord < this.ord || blockIndex != currentBlockIndex) {
|
||||
// The looked up ord is before the current ord or belongs to a different block, seek again
|
||||
final long blockAddress = blockAddresses.get(blockIndex);
|
||||
bytes.seek(blockAddress);
|
||||
this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
|
||||
}
|
||||
// Scan to the looked up ord
|
||||
while (this.ord < ord) {
|
||||
next();
|
||||
} while (this.ord < ord);
|
||||
}
|
||||
}
|
||||
|
||||
private BytesRef getTermFromIndex(long index) throws IOException {
|
||||
|
|
|
@ -865,4 +865,69 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
ireader.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testSortedTermsDictLookupOrd() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
|
||||
Document doc = new Document();
|
||||
SortedDocValuesField field = new SortedDocValuesField("foo", new BytesRef());
|
||||
doc.add(field);
|
||||
final int numDocs = atLeast(Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
field.setBytesValue(new BytesRef("" + i));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.forceMerge(1);
|
||||
IndexReader reader = DirectoryReader.open(writer);
|
||||
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||
doTestTermsDictLookupOrd(leafReader.getSortedDocValues("foo").termsEnum());
|
||||
reader.close();
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSortedSetTermsDictLookupOrd() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
|
||||
Document doc = new Document();
|
||||
SortedSetDocValuesField field = new SortedSetDocValuesField("foo", new BytesRef());
|
||||
doc.add(field);
|
||||
final int numDocs = atLeast(2 * Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
field.setBytesValue(new BytesRef("" + i));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.forceMerge(1);
|
||||
IndexReader reader = DirectoryReader.open(writer);
|
||||
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||
doTestTermsDictLookupOrd(leafReader.getSortedSetDocValues("foo").termsEnum());
|
||||
reader.close();
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void doTestTermsDictLookupOrd(TermsEnum te) throws IOException {
|
||||
List<BytesRef> terms = new ArrayList<>();
|
||||
for (BytesRef term = te.next(); term != null; term = te.next()) {
|
||||
terms.add(BytesRef.deepCopyOf(term));
|
||||
}
|
||||
|
||||
// iterate in order
|
||||
for (int i = 0; i < terms.size(); ++i) {
|
||||
te.seekExact(i);
|
||||
assertEquals(terms.get(i), te.term());
|
||||
}
|
||||
|
||||
// iterate in reverse order
|
||||
for (int i = terms.size() - 1; i >= 0; --i) {
|
||||
te.seekExact(i);
|
||||
assertEquals(terms.get(i), te.term());
|
||||
}
|
||||
|
||||
// iterate in forward order with random gaps
|
||||
for (int i = random().nextInt(5); i < terms.size(); i += random().nextInt(5)) {
|
||||
te.seekExact(i);
|
||||
assertEquals(terms.get(i), te.term());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue