From e8663b30b85c1d48a8d18d37866a553895ffb8ae Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Wed, 28 Jul 2021 15:43:56 +0300 Subject: [PATCH] LUCENE-10039: Fix single-field scoring for CombinedFieldQuery (#229) When there's only one field, CombinedFieldQuery will ignore its weight while scoring. This makes the scoring inconsistent, since the field weight is supposed to multiply its term frequency. This PR removes the optimizations around single-field scoring to make sure the weight is always taken into account. These optimizations are not critical since it should be uncommon to use CombinedFieldQuery with only one field. --- lucene/CHANGES.txt | 3 ++ .../sandbox/search/CombinedFieldQuery.java | 22 +++------- .../search/MultiNormsLeafSimScorer.java | 2 - .../search/TestCombinedFieldQuery.java | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 19 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b2dbfba08a1..3c8120e033d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -418,6 +418,9 @@ Bug Fixes * LUCENE-10026: Fix CombinedFieldQuery equals and hashCode, which ensures query rewrites don't drop CombinedFieldQuery clauses. (Julie Tibshirani) +* LUCENE-10039: Correct CombinedFieldQuery scoring when there is a single + field. (Julie Tibshirani) + Other --------------------- (No changes) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index ffb9d13f1f9..c4e00eabdcf 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -254,8 +254,7 @@ public final class CombinedFieldQuery extends Query implements Accountable { @Override public Query rewrite(IndexReader reader) throws IOException { - // optimize zero and single field cases - if (terms.length == 0) { + if (terms.length == 0 || fieldAndWeights.isEmpty()) { return new BooleanQuery.Builder().build(); } return this; @@ -383,14 +382,9 @@ public final class CombinedFieldQuery extends Query implements Accountable { if (scorer != null) { int newDoc = scorer.iterator().advance(doc); if (newDoc == doc) { - final float freq; - if (scorer instanceof CombinedFieldScorer) { - freq = ((CombinedFieldScorer) scorer).freq(); - } else { - assert scorer instanceof TermScorer; - freq = ((TermScorer) scorer).freq(); - } - final MultiNormsLeafSimScorer docScorer = + assert scorer instanceof CombinedFieldScorer; + float freq = ((CombinedFieldScorer) scorer).freq(); + MultiNormsLeafSimScorer docScorer = new MultiNormsLeafSimScorer( simWeight, context.reader(), fieldAndWeights.values(), true); Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq); @@ -423,13 +417,7 @@ public final class CombinedFieldQuery extends Query implements Accountable { return null; } - // we must optimize this case (term not in segment), disjunctions require >= 2 subs - if (iterators.size() == 1) { - final LeafSimScorer scoringSimScorer = - new LeafSimScorer(simWeight, context.reader(), fields.get(0).field, true); - return new TermScorer(this, iterators.get(0), scoringSimScorer); - } - final MultiNormsLeafSimScorer scoringSimScorer = + MultiNormsLeafSimScorer scoringSimScorer = new MultiNormsLeafSimScorer(simWeight, context.reader(), fields, true); LeafSimScorer nonScoringSimScorer = new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java index f2c6120cff4..ba1d69a8b16 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java @@ -71,8 +71,6 @@ final class MultiNormsLeafSimScorer { if (normsList.isEmpty()) { norms = null; - } else if (normsList.size() == 1) { - norms = normsList.get(0); } else { final NumericDocValues[] normsArr = normsList.toArray(new NumericDocValues[0]); final float[] weightArr = new float[normsList.size()]; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 77aa9ed73ff..7798a2a7017 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -319,6 +319,47 @@ public class TestCombinedFieldQuery extends LuceneTestCase { dir.close(); } + public void testCopyFieldWithSingleField() throws IOException { + Directory dir = new MMapDirectory(createTempDir()); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int boost = Math.max(1, random().nextInt(5)); + int numMatch = atLeast(10); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + int freqB = freqA * boost; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("a", (float) boost) + .addTerm(new BytesRef("foo")) + .build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("b", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } + public void testCopyFieldWithMissingFields() throws IOException { Directory dir = new MMapDirectory(createTempDir()); Similarity similarity = randomCompatibleSimilarity();