From f333b70dbff88c3670c73b7773f40b11e6712af3 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Sat, 17 Jul 2021 03:40:28 +0200 Subject: [PATCH] LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185) This change fixes a bug in `MultiNormsLeafSimScorer` that assumes that each field should have a norm for every term/document. As part of the fix, it adds validation that the fields have consistent norms settings. --- lucene/CHANGES.txt | 3 + .../sandbox/search/CombinedFieldQuery.java | 29 +++++ .../search/MultiNormsLeafSimScorer.java | 21 +++- .../search/TestCombinedFieldQuery.java | 103 ++++++++++++++++++ 4 files changed, 150 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ce9efa72ef5..14ca78d93aa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -398,6 +398,9 @@ Bug Fixes * LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields (Gautam Worah) +* LUCENE-9999: CombinedFieldQuery can fail with an exception when document + is missing some fields. (Jim Ferenczi, Julie Tibshirani) + * LUCENE-10020: DocComparator should not skip docs with the same docID on multiple sorts with search after (Mayya Sharipova, Julie Tibshirani) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 8bafe1178b5..ffb9d13f1f9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -27,6 +27,8 @@ import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.TreeMap; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; @@ -81,6 +83,9 @@ import org.apache.lucene.util.SmallFloat; * compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link * BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported. * + *

The query also requires that either all fields or no fields have norms enabled. Having only + * some fields with norms enabled can result in errors. + * *

The scoring is based on BM25F's simple formula described in: * http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the * same approach but allows other similarities besides {@link @@ -278,6 +283,7 @@ public final class CombinedFieldQuery extends Query implements Accountable { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + validateConsistentNorms(searcher.getIndexReader()); if (scoreMode.needsScores()) { return new CombinedFieldWeight(this, searcher, scoreMode, boost); } else { @@ -287,6 +293,29 @@ public final class CombinedFieldQuery extends Query implements Accountable { } } + private void validateConsistentNorms(IndexReader reader) { + boolean allFieldsHaveNorms = true; + boolean noFieldsHaveNorms = true; + + for (LeafReaderContext context : reader.leaves()) { + FieldInfos fieldInfos = context.reader().getFieldInfos(); + for (String field : fieldAndWeights.keySet()) { + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + if (fieldInfo != null) { + allFieldsHaveNorms &= fieldInfo.hasNorms(); + noFieldsHaveNorms &= fieldInfo.omitsNorms(); + } + } + } + + if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) { + throw new IllegalArgumentException( + getClass().getSimpleName() + + " requires norms to be consistent across fields: some fields cannot " + + " have norms enabled, while others have norms disabled"); + } + } + class CombinedFieldWeight extends Weight { private final IndexSearcher searcher; private final TermStates termStates[]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java index e9fde6f9dea..f2c6120cff4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java @@ -30,7 +30,13 @@ import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.SmallFloat; -/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */ +/** + * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. + * + *

For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also + * requires that either all fields or no fields have norms enabled. Having only some fields with + * norms enabled can result in errors or undefined behavior. + */ final class MultiNormsLeafSimScorer { /** Cache of decoded norms. */ private static final float[] LENGTH_TABLE = new float[256]; @@ -62,6 +68,7 @@ final class MultiNormsLeafSimScorer { weightList.add(field.weight); } } + if (normsList.isEmpty()) { norms = null; } else if (normsList.size() == 1) { @@ -128,14 +135,16 @@ final class MultiNormsLeafSimScorer { @Override public boolean advanceExact(int target) throws IOException { float normValue = 0; + boolean found = false; for (int i = 0; i < normsArr.length; i++) { - boolean found = normsArr[i].advanceExact(target); - assert found; - normValue += - weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())]; + if (normsArr[i].advanceExact(target)) { + normValue += + weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())]; + found = true; + } } current = SmallFloat.intToByte4(Math.round(normValue)); - return true; + return found; } @Override diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 4c2c0aef4a5..77aa9ed73ff 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -46,6 +46,7 @@ import org.apache.lucene.search.similarities.LMDirichletSimilarity; import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -164,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase { dir.close(); } + public void testNormsDisabled() throws IOException { + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + Document doc = new Document(); + doc.add(new StringField("a", "value", Store.NO)); + doc.add(new StringField("b", "value", Store.NO)); + doc.add(new TextField("c", "value", Store.NO)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new StringField("a", "value", Store.NO)); + doc.add(new TextField("c", "value", Store.NO)); + w.addDocument(doc); + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + + Similarity searchSimilarity = randomCompatibleSimilarity(); + searcher.setSimilarity(searchSimilarity); + TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10); + + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("a", 1.0f) + .addField("b", 1.0f) + .addTerm(new BytesRef("value")) + .build(); + searcher.search(query, collector); + TopDocs topDocs = collector.topDocs(); + assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits); + + CombinedFieldQuery invalidQuery = + new CombinedFieldQuery.Builder() + .addField("b", 1.0f) + .addField("c", 1.0f) + .addTerm(new BytesRef("value")) + .build(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector)); + assertTrue(e.getMessage().contains("requires norms to be consistent across fields")); + + reader.close(); + w.close(); + dir.close(); + } + public void testCopyField() throws IOException { Directory dir = newDirectory(); Similarity similarity = randomCompatibleSimilarity(); @@ -265,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase { dir.close(); } + public void testCopyFieldWithMissingFields() throws IOException { + Directory dir = new MMapDirectory(createTempDir()); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); + int numMatch = atLeast(10); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + // Choose frequencies such that sometimes we don't add field B + int freqB = random().nextInt(3); + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + + int freqAB = freqA * boost1 + freqB * boost2; + for (int j = 0; j < freqAB; j++) { + doc.add(new TextField("ab", "foo", Store.NO)); + } + + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("a", (float) boost1) + .addField("b", (float) boost2) + .addTerm(new BytesRef("foo")) + .build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } + private static Similarity randomCompatibleSimilarity() { return RandomPicks.randomFrom( random(),