diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c5d96cbc6fe..561c860a907 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -120,6 +120,8 @@ Optimizations * GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions in order to achieve the same false positive probability with less memory. (Jean-François Boeuf) + +* GITHUB#12118: Optimize FeatureQuery to delegate to a TermQuery weight when scoring is not required. (Ben Trent, Robert Muir) Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java index fcd28a3d2ec..8b1ea8c4725 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java @@ -22,6 +22,7 @@ import org.apache.lucene.document.FeatureField.FeatureFunction; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; @@ -32,6 +33,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.BytesRef; @@ -80,6 +82,13 @@ final class FeatureQuery extends Query { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + if (!scoreMode.needsScores()) { + // We don't need scores (e.g. 
for faceting), and since features are stored as terms, + // allow TermQuery to optimize in this case + TermQuery tq = new TermQuery(new Term(fieldName, featureName)); + return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost); + } + return new Weight(this) { @Override diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java index 1f12944a981..3ec9bc4b3da 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; import java.io.IOException; +import java.util.List; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.LeafReaderContext; @@ -82,7 +83,7 @@ public class TestFeatureField extends LuceneTestCase { DirectoryReader reader = writer.getReader(); writer.close(); - IndexSearcher searcher = new IndexSearcher(reader); + IndexSearcher searcher = LuceneTestCase.newSearcher(reader); LeafReaderContext context = reader.leaves().get(0); Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f); @@ -208,7 +209,7 @@ public class TestFeatureField extends LuceneTestCase { DirectoryReader reader = writer.getReader(); writer.close(); - IndexSearcher searcher = new IndexSearcher(reader); + IndexSearcher searcher = LuceneTestCase.newSearcher(reader); QueryUtils.check( random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher); @@ -341,7 +342,7 @@ public class TestFeatureField extends LuceneTestCase { // NOTE: If you need to make changes below, then you likely also need to // update javadocs of FeatureField. 
- IndexSearcher searcher = new IndexSearcher(reader); + IndexSearcher searcher = LuceneTestCase.newSearcher(reader); searcher.setSimilarity(new BM25Similarity()); Query query = new BooleanQuery.Builder() @@ -361,4 +362,96 @@ public class TestFeatureField extends LuceneTestCase { reader.close(); dir.close(); } + + /** + * Checks that every feature function, when run with {@code ScoreMode.COMPLETE_NO_SCORES}, + * matches exactly the documents that carry the queried feature. + */ + public void testBasicsNonScoringCase() throws IOException { + try (Directory dir = newDirectory()) { + DirectoryReader reader; + try (RandomIndexWriter writer = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) { + Document doc = new Document(); + FeatureField pagerank = new FeatureField("features", "pagerank", 1); + FeatureField urlLength = new FeatureField("features", "urlLen", 1); + doc.add(pagerank); + doc.add(urlLength); + + pagerank.setFeatureValue(10); + urlLength.setFeatureValue(1f / 24); + writer.addDocument(doc); + + pagerank.setFeatureValue(100); + urlLength.setFeatureValue(1f / 20); + writer.addDocument(doc); + + writer.addDocument(new Document()); // gap + + pagerank.setFeatureValue(1); + urlLength.setFeatureValue(1f / 100); + writer.addDocument(doc); + + pagerank.setFeatureValue(42); + urlLength.setFeatureValue(1f / 23); + writer.addDocument(doc); + + Document urlLenDoc = new Document(); + urlLenDoc.add(urlLength); + writer.addDocument(urlLenDoc); + + Document pageRankDoc = new Document(); + pageRankDoc.add(pagerank); + writer.addDocument(pageRankDoc); + + // Merge down to a single segment before opening the reader: the reader is a point-in-time + // view, so a later merge would leave leaves().get(0) covering only part of the index. + writer.forceMerge(1); + reader = writer.getReader(); + } + + IndexSearcher searcher = LuceneTestCase.newSearcher(reader); + LeafReaderContext context = reader.leaves().get(0); + + // "pagerank" was indexed on docs 0, 1, 3, 4 and 6. + for (Query q : + List.of( + FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f), + FeatureField.newLinearQuery("features", "pagerank", 2f), + FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f), + FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f))) { + Weight w = q.createWeight(searcher, 
ScoreMode.COMPLETE_NO_SCORES, 1); + Scorer s = w.scorer(context); + + assertEquals(q.toString(), 0, s.iterator().nextDoc()); + assertEquals(q.toString(), 1, s.iterator().nextDoc()); + assertEquals(q.toString(), 3, s.iterator().nextDoc()); + assertEquals(q.toString(), 4, s.iterator().nextDoc()); + assertEquals(q.toString(), 6, s.iterator().nextDoc()); + assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc()); + } + + // "urlLen" was indexed on docs 0, 1, 3, 4 and 5. + for (Query q : + List.of( + FeatureField.newLogQuery("features", "urlLen", 3f, 4.5f), + FeatureField.newLinearQuery("features", "urlLen", 2f), + FeatureField.newSaturationQuery("features", "urlLen", 3f, 4.5f), + FeatureField.newSigmoidQuery("features", "urlLen", 3f, 4.5f, 0.6f))) { + Weight w = q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1); + Scorer s = w.scorer(context); + + assertEquals(q.toString(), 0, s.iterator().nextDoc()); + assertEquals(q.toString(), 1, s.iterator().nextDoc()); + assertEquals(q.toString(), 3, s.iterator().nextDoc()); + assertEquals(q.toString(), 4, s.iterator().nextDoc()); + assertEquals(q.toString(), 5, s.iterator().nextDoc()); + assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc()); + } + reader.close(); + } + } }