From c8570ed821654cdce5f92ae17d06a21f242524e2 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Mon, 6 Jun 2016 10:35:16 -0400 Subject: [PATCH] LUCENE-7132: BooleanQuery sometimes assigned the wrong score when ranges of documents had only one clause matching while other ranges had more than one clause matchng --- lucene/CHANGES.txt | 5 + .../apache/lucene/search/BooleanScorer.java | 6 +- .../lucene/search/FilterLeafCollector.java | 7 +- .../apache/lucene/search/TestBoolean2.java | 130 +++++++++++++++--- .../lucene/search/TestSimpleExplanations.java | 27 +++- .../TestSimpleExplanationsWithFillerDocs.java | 126 +++++++++++++++++ .../search/BaseExplanationTestCase.java | 43 ++++-- .../search/TestBaseExplanationTestCase.java | 120 ++++++++++++++++ 8 files changed, 422 insertions(+), 42 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestSimpleExplanationsWithFillerDocs.java create mode 100644 lucene/test-framework/src/test/org/apache/lucene/search/TestBaseExplanationTestCase.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8c851f79d35..bd79c3768a7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -148,6 +148,11 @@ Bug Fixes * LUCENE-7312: Fix geo3d's x/y/z double to int encoding to ensure it always rounds down (Karl Wright, Mike McCandless) +* LUCENE-7132: BooleanQuery sometimes assigned too-low scores in cases + where ranges of documents had only a single clause matching while + other ranges had more than one clause matching (Ahmet Arslan, + hossman, Mike McCandless) + Documentation * LUCENE-7223: Improve XXXPoint javadocs to make it clear that you diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index 4534bd465ab..73880a8ff70 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -275,13 +275,13 @@ final class BooleanScorer extends BulkScorer { } } - private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector, + private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector, LeafCollector singleClauseCollector, Bits acceptDocs, int windowMin, int windowMax, int max) throws IOException { assert tail.size() == 0; final int nextWindowBase = head.top().next & ~MASK; final int end = Math.max(windowMax, Math.min(max, nextWindowBase)); - bulkScorer.score(collector, acceptDocs, windowMin, end); + bulkScorer.score(singleClauseCollector, acceptDocs, windowMin, end); // reset the scorer that should be used for the general case collector.setScorer(fakeScorer); @@ -304,7 +304,7 @@ final class BooleanScorer extends BulkScorer { // special case: only one scorer can match in the current window, // we can collect directly final BulkScorerAndDoc bulkScorer = leads[0]; - scoreWindowSingleScorer(bulkScorer, singleClauseCollector, acceptDocs, windowMin, windowMax, max); + scoreWindowSingleScorer(bulkScorer, collector, singleClauseCollector, acceptDocs, windowMin, windowMax, max); return head.add(bulkScorer); } else { // general case, collect through a bit set first and then replay diff --git a/lucene/core/src/java/org/apache/lucene/search/FilterLeafCollector.java b/lucene/core/src/java/org/apache/lucene/search/FilterLeafCollector.java index b55410c87c7..14f5ab0e818 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FilterLeafCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/FilterLeafCollector.java @@ -45,7 +45,12 @@ public abstract class FilterLeafCollector implements LeafCollector { @Override public String toString() { - return getClass().getSimpleName() + "(" + in + ")"; + String name = getClass().getSimpleName(); + if (name.length() == 0) { + // an anonoymous subclass will have empty name? + name = "FilterLeafCollector"; + } + return name + "(" + in + ")"; } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java index 105cd6024cd..581e26f02a0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java @@ -18,15 +18,18 @@ package org.apache.lucene.search; +import java.util.Arrays; import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; @@ -42,24 +45,45 @@ import org.junit.Test; */ public class TestBoolean2 extends LuceneTestCase { private static IndexSearcher searcher; + private static IndexSearcher singleSegmentSearcher; private static IndexSearcher bigSearcher; private static IndexReader reader; private static IndexReader littleReader; - private static int NUM_EXTRA_DOCS = 6000; - + private static IndexReader singleSegmentReader; + /** num of empty docs injected between every doc in the (main) index */ + private static int NUM_FILLER_DOCS; + /** num of empty docs injected prior to the first doc in the (main) index */ + private static int PRE_FILLER_DOCS; + /** num "extra" docs containing value in "field2" added to the "big" clone of the index */ + private static final int NUM_EXTRA_DOCS = 6000; + public static final String field = "field"; private static Directory directory; + private static Directory singleSegmentDirectory; private static Directory dir2; private static int mulFactor; @BeforeClass public static void beforeClass() throws Exception { + // in some runs, test immediate adjacency of matches - in others, force a full bucket gap betwen docs + NUM_FILLER_DOCS = random().nextBoolean() ? 0 : BooleanScorer.SIZE; + PRE_FILLER_DOCS = TestUtil.nextInt(random(), 0, (NUM_FILLER_DOCS / 2)); + directory = newDirectory(); RandomIndexWriter writer= new RandomIndexWriter(random(), directory, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy())); + + Document doc = new Document(); + for (int filler = 0; filler < PRE_FILLER_DOCS; filler++) { + writer.addDocument(doc); + } for (int i = 0; i < docFields.length; i++) { - Document doc = new Document(); doc.add(newTextField(field, docFields[i], Field.Store.NO)); writer.addDocument(doc); + + doc = new Document(); + for (int filler = 0; filler < NUM_FILLER_DOCS; filler++) { + writer.addDocument(doc); + } } writer.close(); littleReader = DirectoryReader.open(directory); @@ -67,6 +91,18 @@ public class TestBoolean2 extends LuceneTestCase { // this is intentionally using the baseline sim, because it compares against bigSearcher (which uses a random one) searcher.setSimilarity(new ClassicSimilarity()); + // make a copy of our index using a single segment + singleSegmentDirectory = new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(directory)); + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + // we need docID order to be preserved: + iwc.setMergePolicy(newLogMergePolicy()); + try (IndexWriter w = new IndexWriter(singleSegmentDirectory, iwc)) { + w.forceMerge(1, true); + } + singleSegmentReader = DirectoryReader.open(singleSegmentDirectory); + singleSegmentSearcher = newSearcher(singleSegmentReader); + singleSegmentSearcher.setSimilarity(searcher.getSimilarity(true)); + // Make big index dir2 = new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(directory)); @@ -86,12 +122,12 @@ public class TestBoolean2 extends LuceneTestCase { docCount = w.maxDoc(); w.close(); mulFactor *= 2; - } while(docCount < 3000); + } while(docCount < 3000 * NUM_FILLER_DOCS); RandomIndexWriter w = new RandomIndexWriter(random(), dir2, newIndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000))); - Document doc = new Document(); + doc = new Document(); doc.add(newTextField("field2", "xxx", Field.Store.NO)); for(int i=0;iexpDocNrs based on the filler docs injected in the index, + * and if neccessary wraps the q in a BooleanQuery that will filter out all + * filler docs using the {@link #EXTRA} field. + * + * @see #replaceIndex + */ + @Override + public void qtest(Query q, int[] expDocNrs) throws Exception { + + expDocNrs = Arrays.copyOf(expDocNrs, expDocNrs.length); + for (int i=0; i < expDocNrs.length; i++) { + expDocNrs[i] = PRE_FILLER_DOCS + ((NUM_FILLER_DOCS + 1) * expDocNrs[i]); + } + + if (null != EXTRA) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(new BooleanClause(q, BooleanClause.Occur.MUST)); + builder.add(new BooleanClause(new TermQuery(new Term(EXTRA, EXTRA)), BooleanClause.Occur.MUST_NOT)); + q = builder.build(); + } + super.qtest(q, expDocNrs); + } + + public void testMA1() throws Exception { + Assume.assumeNotNull("test is not viable with empty filler docs", EXTRA); + super.testMA1(); + } + public void testMA2() throws Exception { + Assume.assumeNotNull("test is not viable with empty filler docs", EXTRA); + super.testMA2(); + } + + +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/BaseExplanationTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/search/BaseExplanationTestCase.java index cde2cdd966e..7775f118396 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/BaseExplanationTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/BaseExplanationTestCase.java @@ -71,22 +71,26 @@ public abstract class BaseExplanationTestCase extends LuceneTestCase { public static void beforeClassTestExplanations() throws Exception { directory = newDirectory(); analyzer = new MockAnalyzer(random()); - RandomIndexWriter writer= new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy())); - for (int i = 0; i < docFields.length; i++) { - Document doc = new Document(); - doc.add(newStringField(KEY, ""+i, Field.Store.NO)); - doc.add(new SortedDocValuesField(KEY, new BytesRef(""+i))); - Field f = newTextField(FIELD, docFields[i], Field.Store.NO); - f.setBoost(i); - doc.add(f); - doc.add(newTextField(ALTFIELD, docFields[i], Field.Store.NO)); - writer.addDocument(doc); + try (RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()))) { + for (int i = 0; i < docFields.length; i++) { + writer.addDocument(createDoc(i)); + } + reader = writer.getReader(); + searcher = newSearcher(reader); } - reader = writer.getReader(); - writer.close(); - searcher = newSearcher(reader); } + public static Document createDoc(int index) { + Document doc = new Document(); + doc.add(newStringField(KEY, ""+index, Field.Store.NO)); + doc.add(new SortedDocValuesField(KEY, new BytesRef(""+index))); + Field f = newTextField(FIELD, docFields[index], Field.Store.NO); + f.setBoost(index); + doc.add(f); + doc.add(newTextField(ALTFIELD, docFields[index], Field.Store.NO)); + return doc; + } + protected static final String[] docFields = { "w1 w2 w3 w4 w5", "w1 w3 w2 w3 zz", @@ -94,8 +98,19 @@ public abstract class BaseExplanationTestCase extends LuceneTestCase { "w1 w3 xx w2 yy w3 zz" }; - /** check the expDocNrs first, then check the query (and the explanations) */ + /** + * check the expDocNrs match and have scores that match the explanations. + * Query may be randomly wrapped in a BooleanQuery with a term that matches no documents in + * order to trigger coord logic. + */ public void qtest(Query q, int[] expDocNrs) throws Exception { + if (random().nextBoolean()) { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + bq.setDisableCoord(random().nextBoolean()); + bq.add(q, BooleanClause.Occur.SHOULD); + bq.add(new TermQuery(new Term("NEVER","MATCH")), BooleanClause.Occur.SHOULD); + q = bq.build(); + } CheckHits.checkHitCollector(random(), q, FIELD, searcher, expDocNrs); } diff --git a/lucene/test-framework/src/test/org/apache/lucene/search/TestBaseExplanationTestCase.java b/lucene/test-framework/src/test/org/apache/lucene/search/TestBaseExplanationTestCase.java new file mode 100644 index 00000000000..97ddd4ac070 --- /dev/null +++ b/lucene/test-framework/src/test/org/apache/lucene/search/TestBaseExplanationTestCase.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; + + +import junit.framework.AssertionFailedError; + +/** + * Tests that the {@link BaseExplanationTestCase} helper code, as well as + * {@link CheckHits#checkNoMatchExplanations} are checking what they are suppose to. + */ +public class TestBaseExplanationTestCase extends BaseExplanationTestCase { + + public void testQueryNoMatchWhenExpected() throws Exception { + expectThrows(AssertionFailedError.class, () -> { + qtest(new TermQuery(new Term(FIELD, "BOGUS")), new int[] { 3 /* none */ }); + }); + } + public void testQueryMatchWhenNotExpected() throws Exception { + expectThrows(AssertionFailedError.class, () -> { + qtest(new TermQuery(new Term(FIELD, "w1")), new int[] { 0, 1 /*, 2, 3 */ }); + }); + } + + public void testIncorrectExplainScores() throws Exception { + // sanity check what a real TermQuery matches + qtest(new TermQuery(new Term(FIELD, "zz")), new int[] { 1, 3 }); + + // ensure when the Explanations are broken, we get an error about those matches + expectThrows(AssertionFailedError.class, () -> { + qtest(new BrokenExplainTermQuery(new Term(FIELD, "zz"), false, true), new int[] { 1, 3 }); + + }); + } + + public void testIncorrectExplainMatches() throws Exception { + // sanity check what a real TermQuery matches + qtest(new TermQuery(new Term(FIELD, "zz")), new int[] { 1, 3 }); + + // ensure when the Explanations are broken, we get an error about the non matches + expectThrows(AssertionFailedError.class, () -> { + CheckHits.checkNoMatchExplanations(new BrokenExplainTermQuery(new Term(FIELD, "zz"), true, false), + FIELD, searcher, new int[] { 1, 3 }); + }); + } + + + public static final class BrokenExplainTermQuery extends TermQuery { + public final boolean toggleExplainMatch; + public final boolean breakExplainScores; + public BrokenExplainTermQuery(Term t, boolean toggleExplainMatch, boolean breakExplainScores) { + super(t); + this.toggleExplainMatch = toggleExplainMatch; + this.breakExplainScores = breakExplainScores; + } + public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException { + return new BrokenExplainWeight(this, super.createWeight(searcher,needsScores)); + } + } + + public static final class BrokenExplainWeight extends Weight { + final Weight in; + public BrokenExplainWeight(BrokenExplainTermQuery q, Weight in) { + super(q); + this.in = in; + } + public BulkScorer bulkScorer(LeafReaderContext context) throws IOException { + return in.bulkScorer(context); + } + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + BrokenExplainTermQuery q = (BrokenExplainTermQuery) this.getQuery(); + Explanation result = in.explain(context, doc); + if (result.isMatch()) { + if (q.breakExplainScores) { + result = Explanation.match(-1F * result.getValue(), "Broken Explanation Score", result); + } + if (q.toggleExplainMatch) { + result = Explanation.noMatch("Broken Explanation Matching", result); + } + } else { + if (q.toggleExplainMatch) { + result = Explanation.match(-42.0F, "Broken Explanation Matching", result); + } + } + return result; + } + public void extractTerms(Set terms) { + in.extractTerms(terms); + } + public float getValueForNormalization() throws IOException { + return in.getValueForNormalization(); + } + public void normalize(float norm, float boost) { + in.normalize(norm, boost); + } + public Scorer scorer(LeafReaderContext context) throws IOException { + return in.scorer(context); + } + } +}