Add `FeatureQuery` weight caching in non-scoring case (#12118)

While FeatureQuery is a powerful tool in the scoring case, there are scenarios where caching should be allowed and scoring disabled.

A particular case is when the FeatureQuery is used in conjunction with learned-sparse retrieval. It is useful to iterate and calculate the entire matching doc set when combined with various other queries.

related to: https://github.com/apache/lucene/issues/11799
This commit is contained in:
Benjamin Trent 2023-02-02 13:00:33 -05:00 committed by GitHub
parent d591c9c37a
commit 4bbc273a43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 99 additions and 3 deletions

View File

@ -120,6 +120,8 @@ Optimizations
* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions * GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions
in order to achieve the same false positive probability with less memory. in order to achieve the same false positive probability with less memory.
(Jean-François Boeuf) (Jean-François Boeuf)
* GITHUB#12118: Optimize FeatureQuery to TermQuery & weight when scoring is not required (Ben Trent, Robert Muir)
Bug Fixes Bug Fixes
--------------------- ---------------------

View File

@ -22,6 +22,7 @@ import org.apache.lucene.document.FeatureField.FeatureFunction;
import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
@ -32,6 +33,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -80,6 +82,13 @@ final class FeatureQuery extends Query {
@Override @Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException { throws IOException {
if (!scoreMode.needsScores()) {
// We don't need scores (e.g. for faceting), and since features are stored as terms,
// allow TermQuery to optimize in this case
TermQuery tq = new TermQuery(new Term(fieldName, featureName));
return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost);
}
return new Weight(this) { return new Weight(this) {
@Override @Override

View File

@ -17,6 +17,7 @@
package org.apache.lucene.document; package org.apache.lucene.document;
import java.io.IOException; import java.io.IOException;
import java.util.List;
import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
@ -82,7 +83,7 @@ public class TestFeatureField extends LuceneTestCase {
DirectoryReader reader = writer.getReader(); DirectoryReader reader = writer.getReader();
writer.close(); writer.close();
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
LeafReaderContext context = reader.leaves().get(0); LeafReaderContext context = reader.leaves().get(0);
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f); Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
@ -208,7 +209,7 @@ public class TestFeatureField extends LuceneTestCase {
DirectoryReader reader = writer.getReader(); DirectoryReader reader = writer.getReader();
writer.close(); writer.close();
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
QueryUtils.check( QueryUtils.check(
random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher); random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
@ -341,7 +342,7 @@ public class TestFeatureField extends LuceneTestCase {
// NOTE: If you need to make changes below, then you likely also need to // NOTE: If you need to make changes below, then you likely also need to
// update javadocs of FeatureField. // update javadocs of FeatureField.
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
searcher.setSimilarity(new BM25Similarity()); searcher.setSimilarity(new BM25Similarity());
Query query = Query query =
new BooleanQuery.Builder() new BooleanQuery.Builder()
@ -361,4 +362,88 @@ public class TestFeatureField extends LuceneTestCase {
reader.close(); reader.close();
dir.close(); dir.close();
} }
/**
 * Verifies that every FeatureField query flavor (log, linear, saturation, sigmoid) produces a
 * working iterator under {@link ScoreMode#COMPLETE_NO_SCORES}, i.e. the non-scoring path that
 * rewrites to a cacheable TermQuery-backed weight still matches exactly the documents that carry
 * the requested feature.
 */
public void testBasicsNonScoringCase() throws IOException {
  try (Directory dir = newDirectory()) {
    DirectoryReader reader;
    try (RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) {
      // The same Document / FeatureField instances are reused across addDocument calls; only
      // the feature values are mutated between additions.
      Document doc = new Document();
      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
      FeatureField urlLength = new FeatureField("features", "urlLen", 1);
      doc.add(pagerank);
      doc.add(urlLength);
      pagerank.setFeatureValue(10);
      urlLength.setFeatureValue(1f / 24);
      writer.addDocument(doc);
      pagerank.setFeatureValue(100);
      urlLength.setFeatureValue(1f / 20);
      writer.addDocument(doc);
      writer.addDocument(new Document()); // gap
      pagerank.setFeatureValue(1);
      urlLength.setFeatureValue(1f / 100);
      writer.addDocument(doc);
      pagerank.setFeatureValue(42);
      urlLength.setFeatureValue(1f / 23);
      writer.addDocument(doc);
      // One doc with only the urlLen feature, one with only the pagerank feature, so the two
      // features match different (overlapping) doc sets.
      Document urlLenDoc = new Document();
      urlLenDoc.add(urlLength);
      writer.addDocument(urlLenDoc);
      Document pageRankDoc = new Document();
      pageRankDoc.add(pagerank);
      writer.addDocument(pageRankDoc);
      reader = writer.getReader();
      writer.forceMerge(1);
    }
    IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
    LeafReaderContext ctx = reader.leaves().get(0);
    // docs 0, 1, 3, 4 carry both features; doc 5 only urlLen; doc 6 only pagerank.
    int[] pagerankExpected = {0, 1, 3, 4, 6};
    for (Query query :
        List.of(
            FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f),
            FeatureField.newLinearQuery("features", "pagerank", 2f),
            FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f),
            FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f))) {
      Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
      Scorer scorer = weight.scorer(ctx);
      for (int expectedDoc : pagerankExpected) {
        assertEquals(query.toString(), expectedDoc, scorer.iterator().nextDoc());
      }
      assertEquals(query.toString(), DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc());
    }
    int[] urlLenExpected = {0, 1, 3, 4, 5};
    for (Query query :
        List.of(
            FeatureField.newLogQuery("features", "urlLen", 3f, 4.5f),
            FeatureField.newLinearQuery("features", "urlLen", 2f),
            FeatureField.newSaturationQuery("features", "urlLen", 3f, 4.5f),
            FeatureField.newSigmoidQuery("features", "urlLen", 3f, 4.5f, 0.6f))) {
      Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
      Scorer scorer = weight.scorer(ctx);
      for (int expectedDoc : urlLenExpected) {
        assertEquals(query.toString(), expectedDoc, scorer.iterator().nextDoc());
      }
      assertEquals(query.toString(), DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc());
    }
    reader.close();
  }
}
} }