From ce93d455327dd7ea200f3b515d72a751a7889881 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 8 Feb 2022 17:14:42 +0100 Subject: [PATCH] LUCENE-10367: Optimize CoveringQuery for the case when the minimum number of matching clauses is a constant. --- lucene/CHANGES.txt | 3 + .../lucene/search/LongValuesSource.java | 12 ++- .../lucene/sandbox/search/CoveringQuery.java | 17 ++++ .../sandbox/search/TestCoveringQuery.java | 88 +++++++++++++++++++ 4 files changed, 119 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1662eb2d503..e9057f826da 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -199,6 +199,9 @@ Optimizations * LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera) +* LUCENE-10367: Optimize CoveringQuery for the case when the minimum number of + matching clauses is a constant. (LuYunCheng via Adrien Grand) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java index 2b2ed49795e..6bf9dd6cb3f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java +++ b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java @@ -156,7 +156,12 @@ public abstract class LongValuesSource implements SegmentCacheable { return new ConstantLongValuesSource(value); } - private static class ConstantLongValuesSource extends LongValuesSource { + /** + * A ConstantLongValuesSource that always returns a constant value + * + * @lucene.internal + */ + public static class ConstantLongValuesSource extends LongValuesSource { private final long value; @@ -211,6 +216,11 @@ public abstract class LongValuesSource implements SegmentCacheable { public LongValuesSource rewrite(IndexSearcher searcher) throws IOException { return this; } + + /** Get the constant value. 
*/ + public long getValue() { + return value; + } } private static class FieldValuesSource extends LongValuesSource { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringQuery.java index c9968989ed8..69e5bd69f83 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringQuery.java @@ -25,10 +25,12 @@ import java.util.stream.Collectors; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LongValues; import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Matches; import org.apache.lucene.search.MatchesUtils; import org.apache.lucene.search.Multiset; @@ -124,6 +126,21 @@ public final class CoveringQuery extends Query implements Accountable { @Override public Query rewrite(IndexReader reader) throws IOException { + if (minimumNumberMatch instanceof LongValuesSource.ConstantLongValuesSource) { + final long constantMin = + ((LongValuesSource.ConstantLongValuesSource) minimumNumberMatch).getValue(); + if (constantMin > queries.size()) { + return new MatchNoDocsQuery( + "More clauses are required to match than the number of clauses"); + } + BooleanQuery.Builder builder = + new BooleanQuery.Builder().setMinimumNumberShouldMatch((int) Math.max(constantMin, 1)); + for (Query query : queries) { + Query r = query.rewrite(reader); + builder.add(r, BooleanClause.Occur.SHOULD); + } + return builder.build(); + } Multiset<Query> rewritten = new Multiset<>(); boolean actuallyRewritten = false; for (Query query : queries) { diff --git
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java index 4627ce2c264..98d174f806c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCoveringQuery.java @@ -36,7 +36,9 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LongValuesSource; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.search.QueryUtils; import org.apache.lucene.tests.util.LuceneTestCase; @@ -147,6 +149,7 @@ public class TestCoveringQuery extends LuceneTestCase { } Query q1 = builder.build(); Query q2 = new CoveringQuery(queries, LongValuesSource.constant(i)); + assertSameMatches(searcher, q1, q2, true); assertEquals(searcher.count(q1), searcher.count(q2)); } @@ -161,4 +164,89 @@ public class TestCoveringQuery extends LuceneTestCase { r.close(); dir.close(); } + + public void testRandomWand() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + int numDocs = atLeast(50); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + if (random().nextBoolean()) { + doc.add(new StringField("field", "A", Store.NO)); + } + if (random().nextBoolean()) { + doc.add(new StringField("field", "B", Store.NO)); + } + if (random().nextDouble() > 0.9) { + doc.add(new StringField("field", "C", Store.NO)); + } + if (random().nextDouble() > 0.1) { + doc.add(new StringField("field", "D", Store.NO)); + } + doc.add(new NumericDocValuesField("min_match", 1)); + w.addDocument(doc); + } + + IndexReader r = DirectoryReader.open(w); + IndexSearcher searcher = new 
IndexSearcher(r); + w.close(); + + int iters = atLeast(10); + for (int iter = 0; iter < iters; ++iter) { + List<Query> queries = new ArrayList<>(); + if (random().nextBoolean()) { + queries.add(new TermQuery(new Term("field", "A"))); + } + if (random().nextBoolean()) { + queries.add(new TermQuery(new Term("field", "B"))); + } + if (random().nextBoolean()) { + queries.add(new TermQuery(new Term("field", "C"))); + } + if (random().nextBoolean()) { + queries.add(new TermQuery(new Term("field", "D"))); + } + if (random().nextBoolean()) { + queries.add(new TermQuery(new Term("field", "E"))); + } + + Query q = new CoveringQuery(queries, LongValuesSource.fromLongField("min_match")); + QueryUtils.check(random(), q, searcher); + + for (int i = 1; i < 4; ++i) { + BooleanQuery.Builder builder = new BooleanQuery.Builder().setMinimumNumberShouldMatch(i); + for (Query query : queries) { + builder.add(query, Occur.SHOULD); + } + Query q1 = builder.build(); + Query q2 = new CoveringQuery(queries, LongValuesSource.constant(i)); + assertSameMatches(searcher, q1, q2, true); + assertEquals(searcher.count(q1), searcher.count(q2)); + } + + Query filtered = + new BooleanQuery.Builder() + .add(q, Occur.MUST) + .add(new TermQuery(new Term("field", "A")), Occur.MUST) + .build(); + QueryUtils.check(random(), filtered, searcher); + } + + r.close(); + dir.close(); + } + + private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boolean scores) + throws IOException { + final int maxDoc = searcher.getIndexReader().maxDoc(); + final TopDocs td1 = searcher.search(q1, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); + final TopDocs td2 = searcher.search(q2, maxDoc, scores ? Sort.RELEVANCE : Sort.INDEXORDER); + assertEquals(td1.totalHits.value, td2.totalHits.value); + for (int i = 0; i < td1.scoreDocs.length; ++i) { + assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); + if (scores) { + assertEquals(td1.scoreDocs[i].score, td2.scoreDocs[i].score, 10e-7); + } + } + } }