LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)

When sorting by low-cardinality fields, the same sub remains current for long
sequences of doc IDs. This speeds up SortedDocIdMerger a bit by extracting
the sub that leads iteration.
This commit is contained in:
Adrien Grand 2021-07-29 08:46:10 +02:00 committed by GitHub
parent 03b1db91f9
commit 0e6c3146d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 54 deletions

View File

@ -402,6 +402,9 @@ Optimizations
* LUCENE-10022: Rewrite empty DisjunctionMaxQuery to MatchNoDocsQuery.
(David Harsha via Julie Tibshirani)
* LUCENE-10031: Slightly faster segment merging for sorted indices.
(Adrien Grand)
Bug Fixes
---------------------
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)

View File

@ -48,6 +48,24 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
* when done
*/
public abstract int nextDoc() throws IOException;
/**
* Like {@link #nextDoc()} but skips over unmapped docs and returns the next mapped doc ID, or
* {@link DocIdSetIterator#NO_MORE_DOCS} when exhausted. This method sets {@link #mappedDocID}
* as a side effect.
*/
public final int nextMappedDoc() throws IOException {
while (true) {
int doc = nextDoc();
if (doc == NO_MORE_DOCS) {
return this.mappedDocID = NO_MORE_DOCS;
}
int mappedDoc = docMap.get(doc);
if (mappedDoc != -1) {
return this.mappedDocID = mappedDoc;
}
}
}
}
/** Construct this from the provided subs, specifying the maximum sub count */
@ -101,36 +119,31 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
@Override
public T next() throws IOException {
while (true) {
int docID = current.nextDoc();
if (docID == NO_MORE_DOCS) {
if (nextIndex == subs.size()) {
current = null;
return null;
}
current = subs.get(nextIndex);
nextIndex++;
continue;
}
int mappedDocID = current.docMap.get(docID);
if (mappedDocID != -1) {
current.mappedDocID = mappedDocID;
return current;
while (current.nextMappedDoc() == NO_MORE_DOCS) {
if (nextIndex == subs.size()) {
current = null;
return null;
}
current = subs.get(nextIndex);
nextIndex++;
}
return current;
}
}
private static class SortedDocIDMerger<T extends DocIDMerger.Sub> extends DocIDMerger<T> {
private final List<T> subs;
private T current;
private final PriorityQueue<T> queue;
private SortedDocIDMerger(List<T> subs, int maxCount) throws IOException {
if (maxCount <= 1) {
throw new IllegalArgumentException();
}
this.subs = subs;
queue =
new PriorityQueue<T>(maxCount) {
new PriorityQueue<T>(maxCount - 1) {
@Override
protected boolean lessThan(Sub a, Sub b) {
assert a.mappedDocID != b.mappedDocID;
@ -144,59 +157,37 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
public void reset() throws IOException {
// caller may not have fully consumed the queue:
queue.clear();
current = null;
boolean first = true;
for (T sub : subs) {
if (first) {
// by setting mappedDocID = -1, this entry is guaranteed to be the top of the queue
// so the first call to next() will advance it
sub.mappedDocID = -1;
current = sub;
first = false;
} else {
int mappedDocID;
while (true) {
int docID = sub.nextDoc();
if (docID == NO_MORE_DOCS) {
mappedDocID = NO_MORE_DOCS;
break;
}
mappedDocID = sub.docMap.get(docID);
if (mappedDocID != -1) {
break;
}
}
if (mappedDocID == NO_MORE_DOCS) {
// all docs in this sub were deleted; do not add it to the queue!
continue;
}
sub.mappedDocID = mappedDocID;
}
queue.add(sub);
} else if (sub.nextMappedDoc() != NO_MORE_DOCS) {
queue.add(sub);
} // else all docs in this sub were deleted; do not add it to the queue!
}
}
@Override
public T next() throws IOException {
T top = queue.top();
while (true) {
int docID = top.nextDoc();
if (docID == NO_MORE_DOCS) {
queue.pop();
top = queue.top();
break;
}
int mappedDocID = top.docMap.get(docID);
if (mappedDocID == -1) {
// doc was deleted
continue;
int nextDoc = current.nextMappedDoc();
if (nextDoc == NO_MORE_DOCS) {
if (queue.size() == 0) {
current = null;
} else {
top.mappedDocID = mappedDocID;
top = queue.updateTop();
break;
current = queue.pop();
}
} else if (queue.size() > 0 && nextDoc > queue.top().mappedDocID) {
T newCurrent = queue.top();
queue.updateTop(current);
current = newCurrent;
}
return top;
return current;
}
}
}