LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)

CombinedFieldQuery does not properly combine overridden collection statistics, resulting in an IllegalArgumentException during searches.
This commit is contained in:
Yannick Welsch 2022-05-30 20:02:40 +02:00 committed by GitHub
parent 318177af83
commit e319a5223c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 86 additions and 1 deletions

View File

@ -88,6 +88,8 @@ Bug Fixes
* LUCENE-10574: Prevent pathological O(N^2) merging. (Adrien Grand)
* LUCENE-10582: Fix merging of overridden CollectionStatistics in CombinedFieldQuery (Yannick Welsch)
Other
---------------------

View File

@ -352,13 +352,14 @@ public final class CombinedFieldQuery extends Query implements Accountable {
private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher)
throws IOException {
long maxDoc = searcher.getIndexReader().maxDoc();
long maxDoc = 0;
long docCount = 0;
long sumTotalTermFreq = 0;
long sumDocFreq = 0;
for (FieldAndWeight fieldWeight : fieldAndWeights.values()) {
CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field);
if (collectionStats != null) {
maxDoc = Math.max(collectionStats.maxDoc(), maxDoc);
docCount = Math.max(collectionStats.docCount(), docCount);
sumDocFreq = Math.max(collectionStats.sumDocFreq(), sumDocFreq);
sumTotalTermFreq += (double) fieldWeight.weight * collectionStats.sumTotalTermFreq();

View File

@ -589,4 +589,86 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
return new BM25Similarity().scorer(boost, collectionStats, termStats);
}
}
/**
 * Regression test for LUCENE-10582: a {@link CombinedFieldQuery} must work with an {@link
 * IndexSearcher} whose {@code collectionStatistics} are overridden.
 *
 * <p>Every matching document holds the term "foo" {@code freqA} times in field "a", {@code freqB}
 * times in field "b" and {@code freqA + freqB} times in field "ab", so "ab" acts as the
 * concatenation of "a" and "b". About half of the matching documents are preceded by a document
 * that only contains "baz" in all three fields. The searcher then inflates every collection
 * statistic by a random amount, using for "ab" the sum of the extra total term frequencies of "a"
 * and "b". Finally the combined query over "a" and "b" is compared against a plain {@link
 * TermQuery} on "ab" through {@code checkExpectedHits} (defined elsewhere in this class;
 * presumably it compares hits and scores of both queries).
 *
 * @throws IOException if indexing or searching fails
 */
public void testOverrideCollectionStatistics() throws IOException {
  // All resources are held in try-with-resources so that they are released even when an
  // assertion fails. Nesting closes them as reader, writer, directory.
  try (Directory dir = newDirectory()) {
    IndexWriterConfig iwc = new IndexWriterConfig();
    Similarity similarity = randomCompatibleSimilarity();
    iwc.setSimilarity(similarity);
    try (RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc)) {
      int numMatch = atLeast(10);
      for (int i = 0; i < numMatch; i++) {
        Document doc = new Document();
        if (random().nextBoolean()) {
          // Non-matching document: "baz" once in "a" and "b", twice in the combined field "ab".
          doc.add(new TextField("a", "baz", Store.NO));
          doc.add(new TextField("b", "baz", Store.NO));
          for (int k = 0; k < 2; k++) {
            doc.add(new TextField("ab", "baz", Store.NO));
          }
          w.addDocument(doc);
          // Reuse the same Document instance for the matching document below.
          doc.clear();
        }
        int freqA = random().nextInt(5) + 1;
        for (int j = 0; j < freqA; j++) {
          doc.add(new TextField("a", "foo", Store.NO));
        }
        int freqB = random().nextInt(5) + 1;
        for (int j = 0; j < freqB; j++) {
          doc.add(new TextField("b", "foo", Store.NO));
        }
        // "ab" carries the summed frequency of "a" and "b".
        int freqAB = freqA + freqB;
        for (int j = 0; j < freqAB; j++) {
          doc.add(new TextField("ab", "foo", Store.NO));
        }
        w.addDocument(doc);
      }
      try (IndexReader reader = w.getReader()) {
        // Extra statistics are generated so that docCount <= maxDoc, sumDocFreq >= docCount and
        // sumTotalTermFreq >= sumDocFreq still hold after they are added to the real values.
        // NOTE(review): these look like the invariants CollectionStatistics validates - confirm.
        int extraMaxDoc = randomIntBetween(0, 10);
        int extraDocCount = randomIntBetween(0, extraMaxDoc);
        int extraSumDocFreq = extraDocCount + randomIntBetween(0, 10);
        int extraSumTotalTermFreqA = extraSumDocFreq + randomIntBetween(0, 10);
        int extraSumTotalTermFreqB = extraSumDocFreq + randomIntBetween(0, 10);
        int extraSumTotalTermFreqAB = extraSumTotalTermFreqA + extraSumTotalTermFreqB;
        IndexSearcher searcher =
            new IndexSearcher(reader) {
              @Override
              public CollectionStatistics collectionStatistics(String field) throws IOException {
                CollectionStatistics shardStatistics = super.collectionStatistics(field);
                int extraSumTotalTermFreq;
                if (field.equals("a")) {
                  extraSumTotalTermFreq = extraSumTotalTermFreqA;
                } else if (field.equals("b")) {
                  extraSumTotalTermFreq = extraSumTotalTermFreqB;
                } else if (field.equals("ab")) {
                  extraSumTotalTermFreq = extraSumTotalTermFreqAB;
                } else {
                  // Only the three fields indexed above are expected to be queried.
                  throw new AssertionError("should never be called");
                }
                return new CollectionStatistics(
                    field,
                    shardStatistics.maxDoc() + extraMaxDoc,
                    shardStatistics.docCount() + extraDocCount,
                    shardStatistics.sumTotalTermFreq() + extraSumTotalTermFreq,
                    shardStatistics.sumDocFreq() + extraSumDocFreq);
              }
            };
        searcher.setSimilarity(similarity);
        CombinedFieldQuery query =
            new CombinedFieldQuery.Builder()
                .addField("a")
                .addField("b")
                .addTerm(new BytesRef("foo"))
                .build();

        checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
      }
    }
  }
}
}