mirror of https://github.com/apache/lucene.git
Add `FeatureQuery` weight caching in non-scoring case (#12118)
While FeatureQuery is a powerful tool in the scoring case, there are scenarios where caching should be allowed and scoring disabled. A particular case is when the FeatureQuery is used in conjunction with learned-sparse retrieval. It is useful to iterate and calculate the entire matching doc set when combined with various other queries. Related to: https://github.com/apache/lucene/issues/11799
This commit is contained in:
parent
d591c9c37a
commit
4bbc273a43
|
@@ -120,6 +120,8 @@ Optimizations
|
|||
* GITHUB#11900: BloomFilteringPostingsFormat now uses multiple hash functions
|
||||
in order to achieve the same false positive probability with less memory.
|
||||
(Jean-François Boeuf)
|
||||
|
||||
* GITHUB#12118 Optimize FeatureQuery to TermQuery & weight when scoring is not required (Ben Trent, Robert Muir)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
|
|
@@ -22,6 +22,7 @@ import org.apache.lucene.document.FeatureField.FeatureFunction;
|
|||
import org.apache.lucene.index.ImpactsEnum;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
|
@@ -32,6 +33,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.QueryVisitor;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@@ -80,6 +82,13 @@ final class FeatureQuery extends Query {
|
|||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
if (!scoreMode.needsScores()) {
|
||||
// We don't need scores (e.g. for faceting), and since features are stored as terms,
|
||||
// allow TermQuery to optimize in this case
|
||||
TermQuery tq = new TermQuery(new Term(fieldName, featureName));
|
||||
return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost);
|
||||
}
|
||||
|
||||
return new Weight(this) {
|
||||
|
||||
@Override
|
||||
|
|
|
@@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@@ -82,7 +83,7 @@ public class TestFeatureField extends LuceneTestCase {
|
|||
DirectoryReader reader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||
LeafReaderContext context = reader.leaves().get(0);
|
||||
|
||||
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
|
||||
|
@@ -208,7 +209,7 @@ public class TestFeatureField extends LuceneTestCase {
|
|||
DirectoryReader reader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||
|
||||
QueryUtils.check(
|
||||
random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
|
||||
|
@@ -341,7 +342,7 @@ public class TestFeatureField extends LuceneTestCase {
|
|||
// NOTE: If you need to make changes below, then you likely also need to
|
||||
// update javadocs of FeatureField.
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||
searcher.setSimilarity(new BM25Similarity());
|
||||
Query query =
|
||||
new BooleanQuery.Builder()
|
||||
|
@@ -361,4 +362,88 @@ public class TestFeatureField extends LuceneTestCase {
|
|||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testBasicsNonScoringCase() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
DirectoryReader reader;
|
||||
try (RandomIndexWriter writer =
|
||||
new RandomIndexWriter(
|
||||
random(),
|
||||
dir,
|
||||
newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) {
|
||||
Document doc = new Document();
|
||||
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
|
||||
FeatureField urlLength = new FeatureField("features", "urlLen", 1);
|
||||
doc.add(pagerank);
|
||||
doc.add(urlLength);
|
||||
|
||||
pagerank.setFeatureValue(10);
|
||||
urlLength.setFeatureValue(1f / 24);
|
||||
writer.addDocument(doc);
|
||||
|
||||
pagerank.setFeatureValue(100);
|
||||
urlLength.setFeatureValue(1f / 20);
|
||||
writer.addDocument(doc);
|
||||
|
||||
writer.addDocument(new Document()); // gap
|
||||
|
||||
pagerank.setFeatureValue(1);
|
||||
urlLength.setFeatureValue(1f / 100);
|
||||
writer.addDocument(doc);
|
||||
|
||||
pagerank.setFeatureValue(42);
|
||||
urlLength.setFeatureValue(1f / 23);
|
||||
writer.addDocument(doc);
|
||||
|
||||
Document urlLenDoc = new Document();
|
||||
urlLenDoc.add(urlLength);
|
||||
writer.addDocument(urlLenDoc);
|
||||
|
||||
Document pageRankDoc = new Document();
|
||||
pageRankDoc.add(pagerank);
|
||||
writer.addDocument(pageRankDoc);
|
||||
|
||||
reader = writer.getReader();
|
||||
writer.forceMerge(1);
|
||||
}
|
||||
|
||||
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||
LeafReaderContext context = reader.leaves().get(0);
|
||||
|
||||
for (Query q :
|
||||
List.of(
|
||||
FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f),
|
||||
FeatureField.newLinearQuery("features", "pagerank", 2f),
|
||||
FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f),
|
||||
FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f))) {
|
||||
Weight w = q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
Scorer s = w.scorer(context);
|
||||
|
||||
assertEquals(q.toString(), 0, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 1, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 3, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 4, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 6, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
|
||||
}
|
||||
|
||||
for (Query q :
|
||||
List.of(
|
||||
FeatureField.newLogQuery("features", "urlLen", 3f, 4.5f),
|
||||
FeatureField.newLinearQuery("features", "urlLen", 2f),
|
||||
FeatureField.newSaturationQuery("features", "urlLen", 3f, 4.5f),
|
||||
FeatureField.newSigmoidQuery("features", "urlLen", 3f, 4.5f, 0.6f))) {
|
||||
Weight w = q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
Scorer s = w.scorer(context);
|
||||
|
||||
assertEquals(q.toString(), 0, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 1, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 3, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 4, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), 5, s.iterator().nextDoc());
|
||||
assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue