LUCENE-10020 DocComparator don't skip docs of same docID (#204)

DocComparator should not skip documents with the same docID when sorting
on multiple fields with search after.

Because of the optimization introduced in LUCENE-9449, when searching with
a sort on [_doc, other fields] combined with search after, DocComparator
can efficiently skip all docs up to and including the provided search-after
docID. This is desirable behaviour in a single-index search. But in a
distributed search, where multiple indices contain docs with the same docID,
a sort on [_doc, other fields] must NOT skip documents whose docID equals
the search-after docID: their remaining sort fields can still place them
after the search-after entry.

This PR fixes this.

Relates to LUCENE-9449
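
To make the failure mode concrete, here is a small self-contained sketch
(hypothetical docIDs and field values, not part of the patch) of which docs the
old and new per-index lower bounds keep competitive:

import java.util.Arrays;
import java.util.stream.IntStream;

// Hypothetical illustration of why the search-after docID must stay competitive
// when several indices share docIDs. Sort is [_doc, my_field]; the previous page
// ended at (doc=4, my_field=12), a hit that came from index 0. Index 1 also
// contains docID 4, whose my_field value is 13, so it sorts AFTER the
// search-after entry and must show up on the next page.
public class SearchAfterDocIdExample {
  public static void main(String[] args) {
    int afterDoc = 4;               // docID of the search-after entry
    int[] index1DocIds = {3, 4, 5}; // docIDs present in index 1

    // Old per-index bound (minDoc = afterDoc + 1): docID 4 is skipped in every
    // index, so index 1's (doc=4, my_field=13) is silently dropped.
    int[] oldCompetitive = IntStream.of(index1DocIds).filter(d -> d > afterDoc).toArray();

    // Fixed bound (minDoc = afterDoc): docID 4 stays competitive; the tie on the
    // docID is then broken by the remaining sort field (my_field, 13 > 12).
    int[] newCompetitive = IntStream.of(index1DocIds).filter(d -> d >= afterDoc).toArray();

    System.out.println("old: " + Arrays.toString(oldCompetitive)); // [5]
    System.out.println("new: " + Arrays.toString(newCompetitive)); // [4, 5]
  }
}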
Mayya Sharipova 2021-07-06 14:59:57 -04:00 committed by GitHub
parent 167bd99c23
commit 64d9f8c587
3 changed files with 72 additions and 2 deletions

lucene/CHANGES.txt

@@ -383,6 +383,9 @@ Bug Fixes

* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
  (Gautam Worah)

* LUCENE-10020: DocComparator should not skip docs with the same docID on
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)

Other
---------------------
(No changes)

lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java

@@ -81,7 +81,11 @@ public class DocComparator extends FieldComparator<Integer> {
     public DocLeafComparator(LeafReaderContext context) {
       this.docBase = context.docBase;
       if (enableSkipping) {
-        this.minDoc = topValue + 1;
+        // Skip docs before topValue, but include docs starting with topValue.
+        // Including topValue is necessary when doing sort on [_doc, other fields]
+        // in a distributed search where there are docs from different indices
+        // with the same docID.
+        this.minDoc = topValue;
         this.maxDoc = context.reader().maxDoc();
         this.competitiveIterator = DocIdSetIterator.all(maxDoc);
       } else {
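
For context on how this bound is used, below is a heavily simplified, hypothetical
sketch (assumed class and method names, not the actual DocComparator code) of a
doc-ID leaf comparator whose competitive iterator starts at an inclusive minDoc
once enough hits have been collected:

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

// Simplified, hypothetical sketch of the skipping idea: once enough hits have been
// collected, only docs in [minDoc, maxDoc) can still compete. minDoc is now the
// search-after docID itself (inclusive), so equal docIDs coming from other indices
// stay competitive and the tie is broken by the remaining sort fields.
class DocIdSkippingSketch {
  private final int minDoc; // inclusive lower bound (the search-after docID)
  private final int maxDoc; // exclusive upper bound (the segment's maxDoc)
  private DocIdSetIterator competitiveIterator;

  DocIdSkippingSketch(int afterDocId, int maxDoc) {
    this.minDoc = afterDocId; // before this change: afterDocId + 1
    this.maxDoc = maxDoc;
    this.competitiveIterator = DocIdSetIterator.all(maxDoc);
  }

  // Called once the collector no longer needs every hit: restrict the iterator
  // to the doc-ID range that can still enter the top hits.
  void onHitsThresholdReached() {
    competitiveIterator =
        new DocIdSetIterator() {
          private int doc = -1;

          @Override
          public int docID() {
            return doc;
          }

          @Override
          public int nextDoc() throws IOException {
            return advance(doc + 1);
          }

          @Override
          public int advance(int target) {
            doc = Math.max(target, minDoc);
            if (doc >= maxDoc) {
              doc = NO_MORE_DOCS;
            }
            return doc;
          }

          @Override
          public long cost() {
            return (long) maxDoc - minDoc;
          }
        };
  }

  DocIdSetIterator getCompetitiveIterator() {
    return competitiveIterator;
  }
}

The real DocLeafComparator additionally guards this behind the enableSkipping flag
and the collector's total-hits threshold; the sketch only illustrates the inclusive
lower bound.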

lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java

@@ -37,7 +37,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

-public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
+public class TestSortOptimization extends LuceneTestCase {

   public void testLongSortOptimization() throws IOException {
@@ -332,6 +332,69 @@ public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
    dir.close();
  }

  /**
   * Test that a search with sort on [_doc, other fields] across multiple indices doesn't miss any
   * documents.
   */
  public void testDocSortOptimizationMultipleIndices() throws IOException {
    final int numIndices = 3;
    final int numDocsInIndex = atLeast(50);
    Directory[] dirs = new Directory[numIndices];
    IndexReader[] readers = new IndexReader[numIndices];
    for (int i = 0; i < numIndices; i++) {
      dirs[i] = newDirectory();
      try (IndexWriter writer = new IndexWriter(dirs[i], new IndexWriterConfig())) {
        for (int docID = 0; docID < numDocsInIndex; docID++) {
          final Document doc = new Document();
          doc.add(new NumericDocValuesField("my_field", docID * numIndices + i));
          writer.addDocument(doc);
        }
        writer.flush();
      }
      readers[i] = DirectoryReader.open(dirs[i]);
    }

    final int size = 7;
    final int totalHitsThreshold = 7;
    final Sort sort = new Sort(FIELD_DOC, new SortField("my_field", SortField.Type.LONG));
    TopFieldDocs[] topDocs = new TopFieldDocs[numIndices];
    int curNumHits;
    FieldDoc after = null;
    long collectedDocs = 0;
    long totalDocs = 0;
    int numHits = 0;
    do {
      for (int i = 0; i < numIndices; i++) {
        IndexSearcher searcher = newSearcher(readers[i]);
        final TopFieldCollector collector =
            TopFieldCollector.create(sort, size, after, totalHitsThreshold);
        searcher.search(new MatchAllDocsQuery(), collector);
        topDocs[i] = collector.topDocs();
        for (int docID = 0; docID < topDocs[i].scoreDocs.length; docID++) {
          topDocs[i].scoreDocs[docID].shardIndex = i;
        }
        collectedDocs += topDocs[i].totalHits.value;
        totalDocs += numDocsInIndex;
      }
      TopFieldDocs mergedTopDocs = TopDocs.merge(sort, size, topDocs);
      curNumHits = mergedTopDocs.scoreDocs.length;
      numHits += curNumHits;
      if (curNumHits > 0) {
        after = (FieldDoc) mergedTopDocs.scoreDocs[curNumHits - 1];
      }
    } while (curNumHits > 0);

    for (int i = 0; i < numIndices; i++) {
      readers[i].close();
      dirs[i].close();
    }

    final int expectedNumHits = numDocsInIndex * numIndices;
    assertEquals(expectedNumHits, numHits);
    // check that the optimization was run, as very few docs were collected
    assertTrue(collectedDocs < totalDocs);
  }
  public void testDocSortOptimizationWithAfter() throws IOException {
    final Directory dir = newDirectory();
    final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());