Add `FeatureQuery` weight caching in non-scoring case (#12118)

While FeatureQuery is a powerful tool when scores are needed, there are scenarios where scoring is not required and the query's matches should be cacheable.

A particular case is when the FeatureQuery is used in conjunction with learned-sparse retrieval. It is useful to iterate and calculate the entire matching doc set when combined with various other queries.

related to: https://github.com/apache/lucene/issues/11799
This commit is contained in:
Benjamin Trent 2023-02-02 13:00:33 -05:00 committed by GitHub
parent d591c9c37a
commit 4bbc273a43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 99 additions and 3 deletions

View File

@ -121,6 +121,8 @@ Optimizations
in order to achieve the same false positive probability with less memory. in order to achieve the same false positive probability with less memory.
(Jean-François Boeuf) (Jean-François Boeuf)
* GITHUB#12118: Optimize FeatureQuery by delegating to a TermQuery weight when scoring is not required. (Ben Trent, Robert Muir)
Bug Fixes Bug Fixes
--------------------- ---------------------
(No changes) (No changes)

View File

@ -22,6 +22,7 @@ import org.apache.lucene.document.FeatureField.FeatureFunction;
import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
@ -32,6 +33,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -80,6 +82,13 @@ final class FeatureQuery extends Query {
@Override @Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException { throws IOException {
if (!scoreMode.needsScores()) {
// We don't need scores (e.g. for faceting), and since features are stored as terms,
// allow TermQuery to optimize in this case
TermQuery tq = new TermQuery(new Term(fieldName, featureName));
return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost);
}
return new Weight(this) { return new Weight(this) {
@Override @Override

View File

@ -17,6 +17,7 @@
package org.apache.lucene.document; package org.apache.lucene.document;
import java.io.IOException; import java.io.IOException;
import java.util.List;
import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
@ -82,7 +83,7 @@ public class TestFeatureField extends LuceneTestCase {
DirectoryReader reader = writer.getReader(); DirectoryReader reader = writer.getReader();
writer.close(); writer.close();
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
LeafReaderContext context = reader.leaves().get(0); LeafReaderContext context = reader.leaves().get(0);
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f); Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
@ -208,7 +209,7 @@ public class TestFeatureField extends LuceneTestCase {
DirectoryReader reader = writer.getReader(); DirectoryReader reader = writer.getReader();
writer.close(); writer.close();
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
QueryUtils.check( QueryUtils.check(
random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher); random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
@ -341,7 +342,7 @@ public class TestFeatureField extends LuceneTestCase {
// NOTE: If you need to make changes below, then you likely also need to // NOTE: If you need to make changes below, then you likely also need to
// update javadocs of FeatureField. // update javadocs of FeatureField.
IndexSearcher searcher = new IndexSearcher(reader); IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
searcher.setSimilarity(new BM25Similarity()); searcher.setSimilarity(new BM25Similarity());
Query query = Query query =
new BooleanQuery.Builder() new BooleanQuery.Builder()
@ -361,4 +362,88 @@ public class TestFeatureField extends LuceneTestCase {
reader.close(); reader.close();
dir.close(); dir.close();
} }
/**
 * Checks that every flavor of feature query, when its weight is created with {@link
 * ScoreMode#COMPLETE_NO_SCORES}, matches exactly the documents that carry the feature.
 *
 * <p>Index layout (doc IDs after merging to one segment): 0, 1, 3 and 4 carry both {@code
 * pagerank} and {@code urlLen}; 2 is an empty gap document; 5 carries only {@code urlLen}; 6
 * carries only {@code pagerank}.
 */
public void testBasicsNonScoringCase() throws IOException {
  try (Directory dir = newDirectory()) {
    DirectoryReader reader;
    try (RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) {
      Document doc = new Document();
      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
      FeatureField urlLength = new FeatureField("features", "urlLen", 1);
      doc.add(pagerank);
      doc.add(urlLength);

      pagerank.setFeatureValue(10);
      urlLength.setFeatureValue(1f / 24);
      writer.addDocument(doc);

      pagerank.setFeatureValue(100);
      urlLength.setFeatureValue(1f / 20);
      writer.addDocument(doc);

      writer.addDocument(new Document()); // gap

      pagerank.setFeatureValue(1);
      urlLength.setFeatureValue(1f / 100);
      writer.addDocument(doc);

      pagerank.setFeatureValue(42);
      urlLength.setFeatureValue(1f / 23);
      writer.addDocument(doc);

      Document urlLenDoc = new Document();
      urlLenDoc.add(urlLength);
      writer.addDocument(urlLenDoc);

      Document pageRankDoc = new Document();
      pageRankDoc.add(pagerank);
      writer.addDocument(pageRankDoc);

      // Merge down to one segment BEFORE opening the reader. A reader is a point-in-time view,
      // so merging afterwards would not change what it sees, and the assertions below rely on
      // all seven documents living in the first (only) leaf with doc IDs 0..6.
      writer.forceMerge(1);
      reader = writer.getReader();
    }

    try {
      IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
      // Take the leaf from the searcher's own reader rather than from `reader`: the weight is
      // created against the searcher, and the test-framework searcher is not guaranteed to use
      // `reader` unwrapped, so the leaf context must belong to the same reader tree.
      LeafReaderContext context = searcher.getIndexReader().leaves().get(0);

      for (Query q :
          List.of(
              FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f),
              FeatureField.newLinearQuery("features", "pagerank", 2f),
              FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f),
              FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f))) {
        assertNonScoringMatches(searcher, context, q, 0, 1, 3, 4, 6);
      }

      for (Query q :
          List.of(
              FeatureField.newLogQuery("features", "urlLen", 3f, 4.5f),
              FeatureField.newLinearQuery("features", "urlLen", 2f),
              FeatureField.newSaturationQuery("features", "urlLen", 3f, 4.5f),
              FeatureField.newSigmoidQuery("features", "urlLen", 3f, 4.5f, 0.6f))) {
        assertNonScoringMatches(searcher, context, q, 0, 1, 3, 4, 5);
      }
    } finally {
      // Close the reader even when an assertion fails so the directory can be closed cleanly.
      reader.close();
    }
  }
}

/**
 * Creates a non-scoring weight for {@code query} and asserts that its iterator over {@code
 * context} returns exactly {@code expectedDocs}, in order, followed by {@link
 * DocIdSetIterator#NO_MORE_DOCS}.
 */
private static void assertNonScoringMatches(
    IndexSearcher searcher, LeafReaderContext context, Query query, int... expectedDocs)
    throws IOException {
  Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
  Scorer scorer = weight.scorer(context);
  assertNotNull(query.toString(), scorer);
  // Fetch the iterator once and advance that single instance through all expected docs.
  DocIdSetIterator iterator = scorer.iterator();
  for (int expectedDoc : expectedDocs) {
    assertEquals(query.toString(), expectedDoc, iterator.nextDoc());
  }
  assertEquals(query.toString(), DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc());
}
} }