LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
This change fixes a bug in `MultiNormsLeafSimScorer`, which assumed that every document has a norm for each queried field. As part of the fix, it adds validation that the fields have consistent norms settings.
commit f333b70dbf (parent 30beb70ffa)
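As background, here is a minimal, self-contained sketch of the scenario this change addresses: a `CombinedFieldQuery` over two fields where one document is missing a field entirely. This is not part of the commit; the field names, the `MissingFieldRepro` class, and the directory choice are illustrative, and `CombinedFieldQuery` is assumed to live in the sandbox package as it did at the time.

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.sandbox.search.CombinedFieldQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class MissingFieldRepro {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
        Document full = new Document();
        full.add(new TextField("title", "foo", Store.NO));
        full.add(new TextField("body", "foo", Store.NO));
        w.addDocument(full);

        Document partial = new Document(); // has no "body" field at all
        partial.add(new TextField("title", "foo", Store.NO));
        w.addDocument(partial);
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        CombinedFieldQuery query =
            new CombinedFieldQuery.Builder()
                .addField("title", 1.0f)
                .addField("body", 1.0f)
                .addTerm(new BytesRef("foo"))
                .build();
        // Before this fix, scoring the partial document could trip an
        // assertion in MultiNormsLeafSimScorer; with the fix, the missing
        // field simply contributes no norm and both documents match.
        System.out.println(searcher.search(query, 10).totalHits);
      }
    }
  }
}
```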
CHANGES.txt
@@ -398,6 +398,9 @@ Bug Fixes
 * LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
   (Gautam Worah)

+* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
+  is missing some fields. (Jim Ferenczi, Julie Tibshirani)
+
 * LUCENE-10020: DocComparator should not skip docs with the same docID on
   multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)

CombinedFieldQuery.java
@@ -27,6 +27,8 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
@@ -81,6 +83,9 @@ import org.apache.lucene.util.SmallFloat;
  * compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
  * BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
  *
+ * <p>The query also requires that either all fields or no fields have norms enabled. Having only
+ * some fields with norms enabled can result in errors.
+ *
  * <p>The scoring is based on BM25F's simple formula described in:
  * http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
  * same approach but allows other similarities besides {@link
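Since the javadoc above now requires all-or-nothing norms across the queried fields, here is a short sketch of one way to satisfy it when norms are unwanted. The `NoNormsDocs` class and field names are made up; `FieldType`, `setOmitNorms`, and `freeze` are standard Lucene API.

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

class NoNormsDocs {
  // One shared field type with norms omitted; using it for every field
  // that CombinedFieldQuery targets keeps the norms settings consistent.
  private static final FieldType NO_NORMS = new FieldType(TextField.TYPE_NOT_STORED);

  static {
    NO_NORMS.setOmitNorms(true);
    NO_NORMS.freeze();
  }

  static Document newDoc(String title, String body) {
    Document doc = new Document();
    doc.add(new Field("title", title, NO_NORMS));
    doc.add(new Field("body", body, NO_NORMS));
    return doc;
  }
}
```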
@@ -278,6 +283,7 @@ public final class CombinedFieldQuery extends Query implements Accountable {
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
+    validateConsistentNorms(searcher.getIndexReader());
     if (scoreMode.needsScores()) {
       return new CombinedFieldWeight(this, searcher, scoreMode, boost);
     } else {
@@ -287,6 +293,29 @@ public final class CombinedFieldQuery extends Query implements Accountable {
     }
   }

+  private void validateConsistentNorms(IndexReader reader) {
+    boolean allFieldsHaveNorms = true;
+    boolean noFieldsHaveNorms = true;
+
+    for (LeafReaderContext context : reader.leaves()) {
+      FieldInfos fieldInfos = context.reader().getFieldInfos();
+      for (String field : fieldAndWeights.keySet()) {
+        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+        if (fieldInfo != null) {
+          allFieldsHaveNorms &= fieldInfo.hasNorms();
+          noFieldsHaveNorms &= fieldInfo.omitsNorms();
+        }
+      }
+    }
+
+    if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
+      throw new IllegalArgumentException(
+          getClass().getSimpleName()
+              + " requires norms to be consistent across fields: some fields cannot "
+              + " have norms enabled, while others have norms disabled");
+    }
+  }
+
   class CombinedFieldWeight extends Weight {
     private final IndexSearcher searcher;
     private final TermStates termStates[];
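Worth noting about the method above (a reading of the code, not additional behavior in the commit): a field that is absent from a segment's `FieldInfos` is skipped, so querying a field that was never indexed does not fail validation; only a genuine mix of norms settings throws. A hedged sketch of the same rule as a standalone pre-check an application could run before building the query (`NormsCheck` and `normsAreConsistent` are made-up names):

```java
import java.util.Set;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;

class NormsCheck {
  // Mirrors the validation above: consistent iff every present field has
  // norms, or every present field omits them.
  static boolean normsAreConsistent(IndexReader reader, Set<String> fields) {
    boolean allHaveNorms = true;
    boolean noneHaveNorms = true;
    for (LeafReaderContext ctx : reader.leaves()) {
      FieldInfos infos = ctx.reader().getFieldInfos();
      for (String field : fields) {
        FieldInfo fi = infos.fieldInfo(field); // null if absent in this segment
        if (fi != null) {
          allHaveNorms &= fi.hasNorms();
          noneHaveNorms &= fi.omitsNorms();
        }
      }
    }
    return allHaveNorms || noneHaveNorms;
  }
}
```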
MultiNormsLeafSimScorer.java
@@ -30,7 +30,13 @@ import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.SmallFloat;

-/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
+/**
+ * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
+ *
+ * <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
+ * requires that either all fields or no fields have norms enabled. Having only some fields with
+ * norms enabled can result in errors or undefined behavior.
+ */
 final class MultiNormsLeafSimScorer {
   /** Cache of decoded norms. */
   private static final float[] LENGTH_TABLE = new float[256];
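For context on the encoding the new javadoc calls out (background, not commit code): BM25-style similarities compress a field length into a single norm byte via `SmallFloat#intToByte4`, and the scorer's `LENGTH_TABLE` inverts each of the 256 possible bytes back to an approximate length. A small round-trip sketch, with an illustrative field length:

```java
import org.apache.lucene.util.SmallFloat;

public class NormEncodingDemo {
  public static void main(String[] args) {
    // Decode table, built the same way as MultiNormsLeafSimScorer.LENGTH_TABLE:
    // each possible norm byte maps back to an approximate field length.
    float[] lengthTable = new float[256];
    for (int i = 0; i < 256; i++) {
      lengthTable[i] = SmallFloat.byte4ToInt((byte) i);
    }

    int fieldLength = 1234; // hypothetical number of terms in a field
    byte norm = SmallFloat.intToByte4(fieldLength);
    // The encoding is lossy: 1234 decodes to a nearby bucket value.
    System.out.println(lengthTable[Byte.toUnsignedInt(norm)]);
  }
}
```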
@@ -62,6 +68,7 @@ final class MultiNormsLeafSimScorer {
         weightList.add(field.weight);
       }
     }
+
     if (normsList.isEmpty()) {
       norms = null;
     } else if (normsList.size() == 1) {
@@ -128,14 +135,16 @@ final class MultiNormsLeafSimScorer {
     @Override
     public boolean advanceExact(int target) throws IOException {
       float normValue = 0;
+      boolean found = false;
       for (int i = 0; i < normsArr.length; i++) {
-        boolean found = normsArr[i].advanceExact(target);
-        assert found;
+        if (normsArr[i].advanceExact(target)) {
           normValue +=
               weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+          found = true;
+        }
       }
       current = SmallFloat.intToByte4(Math.round(normValue));
-      return true;
+      return found;
     }

     @Override
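The fix above hinges on the `NumericDocValues#advanceExact` contract: it returns false when the target document has no value for the field, which is exactly the missing-field case; the old code asserted it was always true and returned true unconditionally. A hedged sketch of the same defensive pattern (`NormSums` and `sumNorms` are made-up names, not part of the commit):

```java
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

class NormSums {
  // Sums one document's raw norm values across several fields, skipping
  // fields the document does not have. Illustrative only.
  static long sumNorms(LeafReader reader, String[] fields, int docId) throws IOException {
    long sum = 0;
    for (String field : fields) {
      NumericDocValues norms = reader.getNormValues(field); // null if field has no norms
      if (norms != null && norms.advanceExact(docId)) { // false => doc missing the field
        sum += norms.longValue();
      }
    }
    return sum;
  }
}
```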
TestCombinedFieldQuery.java
@@ -46,6 +46,7 @@ import org.apache.lucene.search.similarities.LMDirichletSimilarity;
 import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

@@ -164,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }

+  public void testNormsDisabled() throws IOException {
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    Document doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new StringField("b", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+    w.commit();
+
+    doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    Similarity searchSimilarity = randomCompatibleSimilarity();
+    searcher.setSimilarity(searchSimilarity);
+    TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", 1.0f)
+            .addField("b", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    searcher.search(query, collector);
+    TopDocs topDocs = collector.topDocs();
+    assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);
+
+    CombinedFieldQuery invalidQuery =
+        new CombinedFieldQuery.Builder()
+            .addField("b", 1.0f)
+            .addField("c", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    IllegalArgumentException e =
+        expectThrows(
+            IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
+    assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   public void testCopyField() throws IOException {
     Directory dir = newDirectory();
     Similarity similarity = randomCompatibleSimilarity();
@@ -265,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }

+  public void testCopyFieldWithMissingFields() throws IOException {
+    Directory dir = new MMapDirectory(createTempDir());
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+    int numMatch = atLeast(10);
+    for (int i = 0; i < numMatch; i++) {
+      Document doc = new Document();
+      int freqA = random().nextInt(5) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      // Choose frequencies such that sometimes we don't add field B
+      int freqB = random().nextInt(3);
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "foo", Store.NO));
+      }
+
+      int freqAB = freqA * boost1 + freqB * boost2;
+      for (int j = 0; j < freqAB; j++) {
+        doc.add(new TextField("ab", "foo", Store.NO));
+      }
+
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(similarity);
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .addTerm(new BytesRef("foo"))
+            .build();
+
+    checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   private static Similarity randomCompatibleSimilarity() {
     return RandomPicks.randomFrom(
         random(),