mirror of https://github.com/apache/lucene.git
LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
This change fixes a bug in `MultiNormsLeafSimScorer` that assumes that each field should have a norm for every term/document. As part of the fix, it adds validation that the fields have consistent norms settings.
This commit is contained in:
parent
30beb70ffa
commit
f333b70dbf
|
@ -398,6 +398,9 @@ Bug Fixes
|
||||||
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
|
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
|
||||||
(Gautam Worah)
|
(Gautam Worah)
|
||||||
|
|
||||||
|
* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
|
||||||
|
is missing some fields. (Jim Ferenczi, Julie Tibshirani)
|
||||||
|
|
||||||
* LUCENE-10020: DocComparator should not skip docs with the same docID on
|
* LUCENE-10020: DocComparator should not skip docs with the same docID on
|
||||||
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
|
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,8 @@ import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.FieldInfos;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
@ -81,6 +83,9 @@ import org.apache.lucene.util.SmallFloat;
|
||||||
* compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
|
* compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
|
||||||
* BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
|
* BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
|
||||||
*
|
*
|
||||||
|
* <p>The query also requires that either all fields or no fields have norms enabled. Having only
|
||||||
|
* some fields with norms enabled can result in errors.
|
||||||
|
*
|
||||||
* <p>The scoring is based on BM25F's simple formula described in:
|
* <p>The scoring is based on BM25F's simple formula described in:
|
||||||
* http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
|
* http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
|
||||||
* same approach but allows other similarities besides {@link
|
* same approach but allows other similarities besides {@link
|
||||||
|
@ -278,6 +283,7 @@ public final class CombinedFieldQuery extends Query implements Accountable {
|
||||||
@Override
|
@Override
|
||||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
validateConsistentNorms(searcher.getIndexReader());
|
||||||
if (scoreMode.needsScores()) {
|
if (scoreMode.needsScores()) {
|
||||||
return new CombinedFieldWeight(this, searcher, scoreMode, boost);
|
return new CombinedFieldWeight(this, searcher, scoreMode, boost);
|
||||||
} else {
|
} else {
|
||||||
|
@ -287,6 +293,29 @@ public final class CombinedFieldQuery extends Query implements Accountable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void validateConsistentNorms(IndexReader reader) {
|
||||||
|
boolean allFieldsHaveNorms = true;
|
||||||
|
boolean noFieldsHaveNorms = true;
|
||||||
|
|
||||||
|
for (LeafReaderContext context : reader.leaves()) {
|
||||||
|
FieldInfos fieldInfos = context.reader().getFieldInfos();
|
||||||
|
for (String field : fieldAndWeights.keySet()) {
|
||||||
|
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||||
|
if (fieldInfo != null) {
|
||||||
|
allFieldsHaveNorms &= fieldInfo.hasNorms();
|
||||||
|
noFieldsHaveNorms &= fieldInfo.omitsNorms();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
getClass().getSimpleName()
|
||||||
|
+ " requires norms to be consistent across fields: some fields cannot "
|
||||||
|
+ " have norms enabled, while others have norms disabled");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class CombinedFieldWeight extends Weight {
|
class CombinedFieldWeight extends Weight {
|
||||||
private final IndexSearcher searcher;
|
private final IndexSearcher searcher;
|
||||||
private final TermStates termStates[];
|
private final TermStates termStates[];
|
||||||
|
|
|
@ -30,7 +30,13 @@ import org.apache.lucene.search.LeafSimScorer;
|
||||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
import org.apache.lucene.util.SmallFloat;
|
import org.apache.lucene.util.SmallFloat;
|
||||||
|
|
||||||
/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
|
/**
|
||||||
|
* Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
|
||||||
|
*
|
||||||
|
* <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
|
||||||
|
* requires that either all fields or no fields have norms enabled. Having only some fields with
|
||||||
|
* norms enabled can result in errors or undefined behavior.
|
||||||
|
*/
|
||||||
final class MultiNormsLeafSimScorer {
|
final class MultiNormsLeafSimScorer {
|
||||||
/** Cache of decoded norms. */
|
/** Cache of decoded norms. */
|
||||||
private static final float[] LENGTH_TABLE = new float[256];
|
private static final float[] LENGTH_TABLE = new float[256];
|
||||||
|
@ -62,6 +68,7 @@ final class MultiNormsLeafSimScorer {
|
||||||
weightList.add(field.weight);
|
weightList.add(field.weight);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (normsList.isEmpty()) {
|
if (normsList.isEmpty()) {
|
||||||
norms = null;
|
norms = null;
|
||||||
} else if (normsList.size() == 1) {
|
} else if (normsList.size() == 1) {
|
||||||
|
@ -128,14 +135,16 @@ final class MultiNormsLeafSimScorer {
|
||||||
@Override
|
@Override
|
||||||
public boolean advanceExact(int target) throws IOException {
|
public boolean advanceExact(int target) throws IOException {
|
||||||
float normValue = 0;
|
float normValue = 0;
|
||||||
|
boolean found = false;
|
||||||
for (int i = 0; i < normsArr.length; i++) {
|
for (int i = 0; i < normsArr.length; i++) {
|
||||||
boolean found = normsArr[i].advanceExact(target);
|
if (normsArr[i].advanceExact(target)) {
|
||||||
assert found;
|
normValue +=
|
||||||
normValue +=
|
weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
|
||||||
weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
|
found = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
current = SmallFloat.intToByte4(Math.round(normValue));
|
current = SmallFloat.intToByte4(Math.round(normValue));
|
||||||
return true;
|
return found;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -46,6 +46,7 @@ import org.apache.lucene.search.similarities.LMDirichletSimilarity;
|
||||||
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
|
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.MMapDirectory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
@ -164,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testNormsDisabled() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
Similarity similarity = randomCompatibleSimilarity();
|
||||||
|
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig();
|
||||||
|
iwc.setSimilarity(similarity);
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new StringField("a", "value", Store.NO));
|
||||||
|
doc.add(new StringField("b", "value", Store.NO));
|
||||||
|
doc.add(new TextField("c", "value", Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new StringField("a", "value", Store.NO));
|
||||||
|
doc.add(new TextField("c", "value", Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader reader = w.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
|
||||||
|
Similarity searchSimilarity = randomCompatibleSimilarity();
|
||||||
|
searcher.setSimilarity(searchSimilarity);
|
||||||
|
TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);
|
||||||
|
|
||||||
|
CombinedFieldQuery query =
|
||||||
|
new CombinedFieldQuery.Builder()
|
||||||
|
.addField("a", 1.0f)
|
||||||
|
.addField("b", 1.0f)
|
||||||
|
.addTerm(new BytesRef("value"))
|
||||||
|
.build();
|
||||||
|
searcher.search(query, collector);
|
||||||
|
TopDocs topDocs = collector.topDocs();
|
||||||
|
assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);
|
||||||
|
|
||||||
|
CombinedFieldQuery invalidQuery =
|
||||||
|
new CombinedFieldQuery.Builder()
|
||||||
|
.addField("b", 1.0f)
|
||||||
|
.addField("c", 1.0f)
|
||||||
|
.addTerm(new BytesRef("value"))
|
||||||
|
.build();
|
||||||
|
IllegalArgumentException e =
|
||||||
|
expectThrows(
|
||||||
|
IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
|
||||||
|
assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testCopyField() throws IOException {
|
public void testCopyField() throws IOException {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
Similarity similarity = randomCompatibleSimilarity();
|
Similarity similarity = randomCompatibleSimilarity();
|
||||||
|
@ -265,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCopyFieldWithMissingFields() throws IOException {
|
||||||
|
Directory dir = new MMapDirectory(createTempDir());
|
||||||
|
Similarity similarity = randomCompatibleSimilarity();
|
||||||
|
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig();
|
||||||
|
iwc.setSimilarity(similarity);
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
|
||||||
|
int boost1 = Math.max(1, random().nextInt(5));
|
||||||
|
int boost2 = Math.max(1, random().nextInt(5));
|
||||||
|
int numMatch = atLeast(10);
|
||||||
|
for (int i = 0; i < numMatch; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
int freqA = random().nextInt(5) + 1;
|
||||||
|
for (int j = 0; j < freqA; j++) {
|
||||||
|
doc.add(new TextField("a", "foo", Store.NO));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Choose frequencies such that sometimes we don't add field B
|
||||||
|
int freqB = random().nextInt(3);
|
||||||
|
for (int j = 0; j < freqB; j++) {
|
||||||
|
doc.add(new TextField("b", "foo", Store.NO));
|
||||||
|
}
|
||||||
|
|
||||||
|
int freqAB = freqA * boost1 + freqB * boost2;
|
||||||
|
for (int j = 0; j < freqAB; j++) {
|
||||||
|
doc.add(new TextField("ab", "foo", Store.NO));
|
||||||
|
}
|
||||||
|
|
||||||
|
w.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader reader = w.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
searcher.setSimilarity(similarity);
|
||||||
|
CombinedFieldQuery query =
|
||||||
|
new CombinedFieldQuery.Builder()
|
||||||
|
.addField("a", (float) boost1)
|
||||||
|
.addField("b", (float) boost2)
|
||||||
|
.addTerm(new BytesRef("foo"))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
private static Similarity randomCompatibleSimilarity() {
|
private static Similarity randomCompatibleSimilarity() {
|
||||||
return RandomPicks.randomFrom(
|
return RandomPicks.randomFrom(
|
||||||
random(),
|
random(),
|
||||||
|
|
Loading…
Reference in New Issue