mirror of https://github.com/apache/lucene.git
SOLR-9027: Add GraphTermsQuery to limit traversal on high frequency nodes
This commit is contained in:
parent
499ae951ec
commit
06f9bd2bf9
|
@ -0,0 +1,292 @@
|
|||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.search;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.BulkScorer;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.ConstantScoreScorer;
|
||||
import org.apache.lucene.search.ConstantScoreWeight;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.DocIdSetBuilder;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* The GraphTermsQuery builds a disjunction query from a list of terms. The terms are first filtered by the maxDocFreq parameter.
|
||||
* This allows graph traversals to skip traversing high frequency nodes which is often desirable from a performance standpoint.
|
||||
*
|
||||
* Syntax: {!graphTerms f=field maxDocFreq=10000}term1,term2,term3
|
||||
**/
|
||||
|
||||
public class GraphTermsQParserPlugin extends QParserPlugin {
|
||||
public static final String NAME = "graphTerms";
|
||||
|
||||
@Override
|
||||
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
|
||||
return new QParser(qstr, localParams, params, req) {
|
||||
@Override
|
||||
public Query parse() throws SyntaxError {
|
||||
String fname = localParams.get(QueryParsing.F);
|
||||
FieldType ft = req.getSchema().getFieldTypeNoEx(fname);
|
||||
int maxDocFreq = localParams.getInt("maxDocFreq", Integer.MAX_VALUE);
|
||||
String qstr = localParams.get(QueryParsing.V);//never null
|
||||
|
||||
if (qstr.length() == 0) {
|
||||
return new MatchNoDocsQuery();
|
||||
}
|
||||
|
||||
final String[] splitVals = qstr.split(",");
|
||||
|
||||
BytesRef[] bytesRefs = new BytesRef[splitVals.length];
|
||||
BytesRefBuilder term = new BytesRefBuilder();
|
||||
for (int i = 0; i < splitVals.length; i++) {
|
||||
String stringVal = splitVals[i].trim();
|
||||
if (ft != null) {
|
||||
ft.readableToIndexed(stringVal, term);
|
||||
} else {
|
||||
term.copyChars(stringVal);
|
||||
}
|
||||
bytesRefs[i] = term.toBytesRef();
|
||||
}
|
||||
|
||||
return new ConstantScoreQuery(new GraphTermsQuery(fname, bytesRefs, maxDocFreq));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private class GraphTermsQuery extends Query {
|
||||
|
||||
private Term[] queryTerms;
|
||||
private List<TermContext> finalContexts;
|
||||
private List<Term> finalTerms;
|
||||
private String field;
|
||||
private int maxDocFreq;
|
||||
private Object id;
|
||||
|
||||
public GraphTermsQuery(String field, BytesRef[] terms, int maxDocFreq) {
|
||||
this.maxDocFreq = maxDocFreq;
|
||||
this.field = field;
|
||||
this.queryTerms = new Term[terms.length];
|
||||
this.id = new Object();
|
||||
for(int i=0; i<terms.length; i++) {
|
||||
this.queryTerms[i] = new Term(field, terms[i]);
|
||||
}
|
||||
}
|
||||
|
||||
//Just for cloning
|
||||
private GraphTermsQuery(String field, Term[] terms, int maxDocFreq, Object id) {
|
||||
this.field = field;
|
||||
this.queryTerms = terms;
|
||||
this.maxDocFreq = maxDocFreq;
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
if(this.finalContexts == null) {
|
||||
//This query has not been re-written yet.
|
||||
//Rewriting the query does not effect the cache key as this query is not designed to be cached.
|
||||
this.finalContexts = new ArrayList();
|
||||
this.finalTerms = new ArrayList();
|
||||
List<LeafReaderContext> contexts = reader.leaves();
|
||||
TermContext[] termContexts = new TermContext[this.queryTerms.length];
|
||||
collectTermContext(reader, contexts, termContexts, this.queryTerms);
|
||||
for(int i=0; i<termContexts.length; i++) {
|
||||
TermContext termContext = termContexts[i];
|
||||
if(termContext != null && termContext.docFreq() < this.maxDocFreq) {
|
||||
this.finalContexts.add(termContext);
|
||||
this.finalTerms.add(queryTerms[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return 31 * super.hashCode() + id.hashCode();
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (super.equals(o) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GraphTermsQuery q = (GraphTermsQuery)o;
|
||||
return id == q.id;
|
||||
}
|
||||
|
||||
public GraphTermsQuery clone() {
|
||||
GraphTermsQuery clone = new GraphTermsQuery(this.field, this.queryTerms, this.maxDocFreq, this.id);
|
||||
return clone;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String defaultField) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean first = true;
|
||||
for (Term term : this.queryTerms) {
|
||||
if (!first) {
|
||||
builder.append(',');
|
||||
}
|
||||
first = false;
|
||||
builder.append(term.toString());
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private class WeightOrDocIdSet {
|
||||
final Weight weight;
|
||||
final DocIdSet set;
|
||||
|
||||
WeightOrDocIdSet(DocIdSet bitset) {
|
||||
this.set = bitset;
|
||||
this.weight = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
|
||||
return new ConstantScoreWeight(this) {
|
||||
|
||||
@Override
|
||||
public void extractTerms(Set<Term> terms) {
|
||||
// no-op
|
||||
// This query is for abuse cases when the number of terms is too high to
|
||||
// run efficiently as a BooleanQuery. So likewise we hide its terms in
|
||||
// order to protect highlighters
|
||||
}
|
||||
|
||||
private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
|
||||
final LeafReader reader = context.reader();
|
||||
final Fields fields = reader.fields();
|
||||
Terms terms = fields.terms(field);
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
PostingsEnum docs = null;
|
||||
DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
|
||||
for (int i=0; i<finalContexts.size(); i++) {
|
||||
TermContext termContext = finalContexts.get(i);
|
||||
TermState termState = termContext.get(context.ord);
|
||||
if(termState != null) {
|
||||
Term term = finalTerms.get(i);
|
||||
termsEnum.seekExact(term.bytes(), termContext.get(context.ord));
|
||||
docs = termsEnum.postings(docs, PostingsEnum.NONE);
|
||||
builder.add(docs);
|
||||
}
|
||||
}
|
||||
return new WeightOrDocIdSet(builder.build());
|
||||
}
|
||||
|
||||
private Scorer scorer(DocIdSet set) throws IOException {
|
||||
if (set == null) {
|
||||
return null;
|
||||
}
|
||||
final DocIdSetIterator disi = set.iterator();
|
||||
if (disi == null) {
|
||||
return null;
|
||||
}
|
||||
return new ConstantScoreScorer(this, score(), disi);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
|
||||
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
|
||||
if (weightOrBitSet.weight != null) {
|
||||
return weightOrBitSet.weight.bulkScorer(context);
|
||||
} else {
|
||||
final Scorer scorer = scorer(weightOrBitSet.set);
|
||||
if (scorer == null) {
|
||||
return null;
|
||||
}
|
||||
return new DefaultBulkScorer(scorer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
|
||||
if (weightOrBitSet.weight != null) {
|
||||
return weightOrBitSet.weight.scorer(context);
|
||||
} else {
|
||||
return scorer(weightOrBitSet.set);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private void collectTermContext(IndexReader reader,
|
||||
List<LeafReaderContext> leaves,
|
||||
TermContext[] contextArray,
|
||||
Term[] queryTerms) throws IOException {
|
||||
TermsEnum termsEnum = null;
|
||||
for (LeafReaderContext context : leaves) {
|
||||
final Fields fields = context.reader().fields();
|
||||
for (int i = 0; i < queryTerms.length; i++) {
|
||||
Term term = queryTerms[i];
|
||||
TermContext termContext = contextArray[i];
|
||||
final Terms terms = fields.terms(term.field());
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
continue;
|
||||
}
|
||||
termsEnum = terms.iterator();
|
||||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY) continue;
|
||||
if (termsEnum.seekExact(term.bytes())) {
|
||||
if (termContext == null) {
|
||||
contextArray[i] = new TermContext(reader.getContext(),
|
||||
termsEnum.termState(), context.ord, termsEnum.docFreq(),
|
||||
termsEnum.totalTermFreq());
|
||||
} else {
|
||||
termContext.register(termsEnum.termState(), context.ord,
|
||||
termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -76,6 +76,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
|
|||
map.put(HashQParserPlugin.NAME, HashQParserPlugin.class);
|
||||
map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class);
|
||||
map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class);
|
||||
map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class);
|
||||
standardPlugins = Collections.unmodifiableMap(map);
|
||||
}
|
||||
|
||||
|
|
|
@ -165,6 +165,15 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
|
|||
}
|
||||
}
|
||||
|
||||
public void testGraphTermsQuery() throws Exception {
|
||||
SolrQueryRequest req = req("q", "*:*");
|
||||
try {
|
||||
assertQueryEquals("graphTerms", req, "{!graphTerms f=field1 maxDocFreq=1000}term1,term2");
|
||||
} finally {
|
||||
req.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testQuerySwitch() throws Exception {
|
||||
SolrQueryRequest req = req("myXXX", "XXX",
|
||||
"myField", "foo_s",
|
||||
|
|
|
@ -0,0 +1,135 @@
|
|||
package org.apache.solr.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelector;
|
||||
import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelectorType;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
//We want codecs that support DocValues, and ones supporting blank/empty values.
|
||||
@SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42"})
|
||||
public class TestGraphTermsQParserPlugin extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-collapseqparser.xml", "schema11.xml");
|
||||
}
|
||||
|
||||
@Override
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
// if you override setUp or tearDown, you better call
|
||||
// the super classes version
|
||||
super.setUp();
|
||||
clearIndex();
|
||||
assertU(commit());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQueries() throws Exception {
|
||||
|
||||
String group = "group_s";
|
||||
|
||||
String[] doc = {"id","1", "term_s", "YYYY", group, "1", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
|
||||
assertU(adoc(doc));
|
||||
String[] doc1 = {"id","2", "term_s","YYYY", group, "1", "test_ti", "5", "test_tl", "100", "test_tf", "200"};
|
||||
assertU(adoc(doc1));
|
||||
|
||||
String[] doc2 = {"id","3", "term_s", "YYYY", "test_ti", "5000", "test_tl", "100", "test_tf", "200"};
|
||||
assertU(adoc(doc2));
|
||||
assertU(commit());
|
||||
String[] doc3 = {"id","4", "term_s", "YYYY", "test_ti", "500", "test_tl", "1000", "test_tf", "2000"};
|
||||
assertU(adoc(doc3));
|
||||
|
||||
String[] doc4 = {"id","5", "term_s", "YYYY", group, "2", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
|
||||
assertU(adoc(doc4));
|
||||
assertU(commit());
|
||||
String[] doc5 = {"id","6", "term_s","YYYY", group, "2", "test_ti", "10", "test_tl", "100", "test_tf", "200"};
|
||||
assertU(adoc(doc5));
|
||||
assertU(commit());
|
||||
|
||||
String[] doc6 = {"id","7", "term_s", "YYYY", group, "1", "test_ti", "10", "test_tl", "50", "test_tf", "300"};
|
||||
assertU(adoc(doc6));
|
||||
assertU(commit());
|
||||
|
||||
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add("q", "{!graphTerms f=group_s maxDocFreq=10}1,2");
|
||||
params.add("sort", "id asc");
|
||||
assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
|
||||
"//result/doc[1]/float[@name='id'][.='1.0']",
|
||||
"//result/doc[2]/float[@name='id'][.='2.0']",
|
||||
"//result/doc[3]/float[@name='id'][.='5.0']",
|
||||
"//result/doc[4]/float[@name='id'][.='6.0']",
|
||||
"//result/doc[5]/float[@name='id'][.='7.0']"
|
||||
);
|
||||
|
||||
//Test without maxDocFreq param. Should default to Integer.MAX_VALUE and match all terms.
|
||||
params = new ModifiableSolrParams();
|
||||
params.add("q", "{!graphTerms f=group_s}1,2");
|
||||
params.add("sort", "id asc");
|
||||
assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
|
||||
"//result/doc[1]/float[@name='id'][.='1.0']",
|
||||
"//result/doc[2]/float[@name='id'][.='2.0']",
|
||||
"//result/doc[3]/float[@name='id'][.='5.0']",
|
||||
"//result/doc[4]/float[@name='id'][.='6.0']",
|
||||
"//result/doc[5]/float[@name='id'][.='7.0']"
|
||||
);
|
||||
|
||||
params = new ModifiableSolrParams();
|
||||
params.add("q", "{!graphTerms f=group_s maxDocFreq=1}1,2");
|
||||
params.add("sort", "id asc");
|
||||
assertQ(req(params, "indent", "on"), "*[count(//doc)=0]"
|
||||
);
|
||||
|
||||
//Test with int field
|
||||
params = new ModifiableSolrParams();
|
||||
params.add("q", "{!graphTerms f=test_ti maxDocFreq=10}5,10");
|
||||
params.add("sort", "id asc");
|
||||
assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
|
||||
"//result/doc[1]/float[@name='id'][.='1.0']",
|
||||
"//result/doc[2]/float[@name='id'][.='2.0']",
|
||||
"//result/doc[3]/float[@name='id'][.='5.0']",
|
||||
"//result/doc[4]/float[@name='id'][.='6.0']",
|
||||
"//result/doc[5]/float[@name='id'][.='7.0']"
|
||||
);
|
||||
|
||||
//Test with int field
|
||||
params = new ModifiableSolrParams();
|
||||
params.add("q", "{!graphTerms f=test_ti maxDocFreq=3}5,10");
|
||||
params.add("sort", "id asc");
|
||||
assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
|
||||
"//result/doc[1]/float[@name='id'][.='6.0']",
|
||||
"//result/doc[2]/float[@name='id'][.='7.0']"
|
||||
);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue