mirror of https://github.com/apache/lucene.git
Add `FeatureQuery` weight caching in non-scoring case (#12118)
While FeatureQuery is a powerful tool in the scoring case, there are scenarios where scoring should be disabled and caching allowed. One such case is when FeatureQuery is used in conjunction with learned-sparse retrieval: there it is useful to iterate over and compute the entire matching doc set when the query is combined with various other queries. Related to: https://github.com/apache/lucene/issues/11799
This commit is contained in:
parent
d591c9c37a
commit
4bbc273a43
|
@ -121,6 +121,8 @@ Optimizations
|
||||||
in order to achieve the same false positive probability with less memory.
|
in order to achieve the same false positive probability with less memory.
|
||||||
(Jean-François Boeuf)
|
(Jean-François Boeuf)
|
||||||
|
|
||||||
|
* GITHUB#12118: Optimize FeatureQuery by delegating to TermQuery's Weight when scoring is not required (Ben Trent, Robert Muir)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
(No changes)
|
(No changes)
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.document.FeatureField.FeatureFunction;
|
||||||
import org.apache.lucene.index.ImpactsEnum;
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
@ -32,6 +33,7 @@ import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.QueryVisitor;
|
import org.apache.lucene.search.QueryVisitor;
|
||||||
import org.apache.lucene.search.ScoreMode;
|
import org.apache.lucene.search.ScoreMode;
|
||||||
import org.apache.lucene.search.Scorer;
|
import org.apache.lucene.search.Scorer;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.search.Weight;
|
import org.apache.lucene.search.Weight;
|
||||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -80,6 +82,13 @@ final class FeatureQuery extends Query {
|
||||||
@Override
|
@Override
|
||||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
if (!scoreMode.needsScores()) {
|
||||||
|
// We don't need scores (e.g. for faceting), and since features are stored as terms,
|
||||||
|
// allow TermQuery to optimize in this case
|
||||||
|
TermQuery tq = new TermQuery(new Term(fieldName, featureName));
|
||||||
|
return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost);
|
||||||
|
}
|
||||||
|
|
||||||
return new Weight(this) {
|
return new Weight(this) {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.lucene.document;
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.lucene.document.Field.Store;
|
import org.apache.lucene.document.Field.Store;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
@ -82,7 +83,7 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
DirectoryReader reader = writer.getReader();
|
DirectoryReader reader = writer.getReader();
|
||||||
writer.close();
|
writer.close();
|
||||||
|
|
||||||
IndexSearcher searcher = new IndexSearcher(reader);
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
LeafReaderContext context = reader.leaves().get(0);
|
LeafReaderContext context = reader.leaves().get(0);
|
||||||
|
|
||||||
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
|
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
|
||||||
|
@ -208,7 +209,7 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
DirectoryReader reader = writer.getReader();
|
DirectoryReader reader = writer.getReader();
|
||||||
writer.close();
|
writer.close();
|
||||||
|
|
||||||
IndexSearcher searcher = new IndexSearcher(reader);
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
|
|
||||||
QueryUtils.check(
|
QueryUtils.check(
|
||||||
random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
|
random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
|
||||||
|
@ -341,7 +342,7 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
// NOTE: If you need to make changes below, then you likely also need to
|
// NOTE: If you need to make changes below, then you likely also need to
|
||||||
// update javadocs of FeatureField.
|
// update javadocs of FeatureField.
|
||||||
|
|
||||||
IndexSearcher searcher = new IndexSearcher(reader);
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
searcher.setSimilarity(new BM25Similarity());
|
searcher.setSimilarity(new BM25Similarity());
|
||||||
Query query =
|
Query query =
|
||||||
new BooleanQuery.Builder()
|
new BooleanQuery.Builder()
|
||||||
|
@ -361,4 +362,88 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
reader.close();
|
reader.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testBasicsNonScoringCase() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
DirectoryReader reader;
|
||||||
|
try (RandomIndexWriter writer =
|
||||||
|
new RandomIndexWriter(
|
||||||
|
random(),
|
||||||
|
dir,
|
||||||
|
newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) {
|
||||||
|
Document doc = new Document();
|
||||||
|
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
|
||||||
|
FeatureField urlLength = new FeatureField("features", "urlLen", 1);
|
||||||
|
doc.add(pagerank);
|
||||||
|
doc.add(urlLength);
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(10);
|
||||||
|
urlLength.setFeatureValue(1f / 24);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(100);
|
||||||
|
urlLength.setFeatureValue(1f / 20);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
writer.addDocument(new Document()); // gap
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(1);
|
||||||
|
urlLength.setFeatureValue(1f / 100);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(42);
|
||||||
|
urlLength.setFeatureValue(1f / 23);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
Document urlLenDoc = new Document();
|
||||||
|
urlLenDoc.add(urlLength);
|
||||||
|
writer.addDocument(urlLenDoc);
|
||||||
|
|
||||||
|
Document pageRankDoc = new Document();
|
||||||
|
pageRankDoc.add(pagerank);
|
||||||
|
writer.addDocument(pageRankDoc);
|
||||||
|
|
||||||
|
reader = writer.getReader();
|
||||||
|
writer.forceMerge(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
|
LeafReaderContext context = reader.leaves().get(0);
|
||||||
|
|
||||||
|
for (Query q :
|
||||||
|
List.of(
|
||||||
|
FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f),
|
||||||
|
FeatureField.newLinearQuery("features", "pagerank", 2f),
|
||||||
|
FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f),
|
||||||
|
FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f))) {
|
||||||
|
Weight w = q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||||
|
Scorer s = w.scorer(context);
|
||||||
|
|
||||||
|
assertEquals(q.toString(), 0, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 1, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 3, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 4, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 6, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Query q :
|
||||||
|
List.of(
|
||||||
|
FeatureField.newLogQuery("features", "urlLen", 3f, 4.5f),
|
||||||
|
FeatureField.newLinearQuery("features", "urlLen", 2f),
|
||||||
|
FeatureField.newSaturationQuery("features", "urlLen", 3f, 4.5f),
|
||||||
|
FeatureField.newSigmoidQuery("features", "urlLen", 3f, 4.5f, 0.6f))) {
|
||||||
|
Weight w = q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||||
|
Scorer s = w.scorer(context);
|
||||||
|
|
||||||
|
assertEquals(q.toString(), 0, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 1, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 3, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 4, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), 5, s.iterator().nextDoc());
|
||||||
|
assertEquals(q.toString(), DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue