SOLR-9027: Add GraphTermsQuery to limit traversal on high frequency nodes

2016-04-26 10:15:03 -04:00 · 2016-04-26 10:15:03 -04:00 · d66f5515e6
parent 9c69c4cf12
commit d66f5515e6
4 changed files with 437 additions and 0 deletions
--- a/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java
@ -0,0 +1,292 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search;
+
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BulkScorer;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.FieldType;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ *  The GraphTermsQuery builds a disjunction query from a list of terms. The terms are first filtered by the maxDocFreq parameter.
+ *  This allows graph traversals to skip traversing high frequency nodes which is often desirable from a performance standpoint.
+ *
+ *   Syntax: {!graphTerms f=field maxDocFreq=10000}term1,term2,term3
+ **/
+
+public class GraphTermsQParserPlugin extends QParserPlugin {
+  public static final String NAME = "graphTerms";
+
+  @Override
+  public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
+    return new QParser(qstr, localParams, params, req) {
+      @Override
+      public Query parse() throws SyntaxError {
+        String fname = localParams.get(QueryParsing.F);
+        FieldType ft = req.getSchema().getFieldTypeNoEx(fname);
+        int maxDocFreq = localParams.getInt("maxDocFreq", Integer.MAX_VALUE);
+        String qstr = localParams.get(QueryParsing.V);//never null
+
+        if (qstr.length() == 0) {
+          return new MatchNoDocsQuery();
+        }
+
+        final String[] splitVals = qstr.split(",");
+
+        BytesRef[] bytesRefs = new BytesRef[splitVals.length];
+        BytesRefBuilder term = new BytesRefBuilder();
+        for (int i = 0; i < splitVals.length; i++) {
+          String stringVal = splitVals[i].trim();
+          if (ft != null) {
+            ft.readableToIndexed(stringVal, term);
+          } else {
+            term.copyChars(stringVal);
+          }
+          bytesRefs[i] = term.toBytesRef();
+        }
+
+        return new ConstantScoreQuery(new GraphTermsQuery(fname, bytesRefs, maxDocFreq));
+      }
+    };
+  }
+
+  private class GraphTermsQuery extends Query {
+
+    private Term[] queryTerms;
+    private List<TermContext> finalContexts;
+    private List<Term> finalTerms;
+    private String field;
+    private int maxDocFreq;
+    private Object id;
+
+    public GraphTermsQuery(String field, BytesRef[] terms, int maxDocFreq) {
+      this.maxDocFreq = maxDocFreq;
+      this.field = field;
+      this.queryTerms = new Term[terms.length];
+      this.id = new Object();
+      for(int i=0; i<terms.length; i++) {
+        this.queryTerms[i] = new Term(field, terms[i]);
+      }
+    }
+
+    //Just for cloning
+    private GraphTermsQuery(String field, Term[] terms, int maxDocFreq, Object id) {
+      this.field = field;
+      this.queryTerms = terms;
+      this.maxDocFreq = maxDocFreq;
+      this.id = id;
+    }
+
+    @Override
+    public Query rewrite(IndexReader reader) throws IOException {
+      if(this.finalContexts == null) {
+        //This query has not been re-written yet.
+        //Rewriting the query does not effect the cache key as this query is not designed to be cached.
+        this.finalContexts = new ArrayList();
+        this.finalTerms = new ArrayList();
+        List<LeafReaderContext> contexts = reader.leaves();
+        TermContext[] termContexts = new TermContext[this.queryTerms.length];
+        collectTermContext(reader, contexts, termContexts, this.queryTerms);
+        for(int i=0; i<termContexts.length; i++) {
+          TermContext termContext = termContexts[i];
+          if(termContext != null && termContext.docFreq() < this.maxDocFreq) {
+            this.finalContexts.add(termContext);
+            this.finalTerms.add(queryTerms[i]);
+          }
+        }
+      }
+
+      return this;
+    }
+
+    public int hashCode() {
+      return 31 * super.hashCode() + id.hashCode();
+    }
+
+    public boolean equals(Object o) {
+      if (super.equals(o) == false) {
+        return false;
+      }
+
+      GraphTermsQuery q = (GraphTermsQuery)o;
+      return id == q.id;
+    }
+
+    public GraphTermsQuery clone() {
+      GraphTermsQuery clone = new GraphTermsQuery(this.field, this.queryTerms, this.maxDocFreq, this.id);
+      return clone;
+    }
+
+    @Override
+    public String toString(String defaultField) {
+      StringBuilder builder = new StringBuilder();
+      boolean first = true;
+      for (Term term : this.queryTerms) {
+        if (!first) {
+          builder.append(',');
+        }
+        first = false;
+        builder.append(term.toString());
+      }
+
+      return builder.toString();
+    }
+
+    private class WeightOrDocIdSet {
+      final Weight weight;
+      final DocIdSet set;
+
+      WeightOrDocIdSet(DocIdSet bitset) {
+        this.set = bitset;
+        this.weight = null;
+      }
+    }
+
+    @Override
+    public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+      return new ConstantScoreWeight(this) {
+
+        @Override
+        public void extractTerms(Set<Term> terms) {
+          // no-op
+          // This query is for abuse cases when the number of terms is too high to
+          // run efficiently as a BooleanQuery. So likewise we hide its terms in
+          // order to protect highlighters
+        }
+
+        private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
+          final LeafReader reader = context.reader();
+          final Fields fields = reader.fields();
+          Terms terms = fields.terms(field);
+          TermsEnum  termsEnum = terms.iterator();
+          PostingsEnum docs = null;
+          DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+          for (int i=0; i<finalContexts.size(); i++) {
+            TermContext termContext = finalContexts.get(i);
+            TermState termState = termContext.get(context.ord);
+            if(termState != null) {
+              Term term = finalTerms.get(i);
+              termsEnum.seekExact(term.bytes(), termContext.get(context.ord));
+              docs = termsEnum.postings(docs, PostingsEnum.NONE);
+              builder.add(docs);
+            }
+          }
+          return new WeightOrDocIdSet(builder.build());
+        }
+
+        private Scorer scorer(DocIdSet set) throws IOException {
+          if (set == null) {
+            return null;
+          }
+          final DocIdSetIterator disi = set.iterator();
+          if (disi == null) {
+            return null;
+          }
+          return new ConstantScoreScorer(this, score(), disi);
+        }
+
+        @Override
+        public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+          final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+          if (weightOrBitSet.weight != null) {
+            return weightOrBitSet.weight.bulkScorer(context);
+          } else {
+            final Scorer scorer = scorer(weightOrBitSet.set);
+            if (scorer == null) {
+              return null;
+            }
+            return new DefaultBulkScorer(scorer);
+          }
+        }
+
+        @Override
+        public Scorer scorer(LeafReaderContext context) throws IOException {
+          final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+          if (weightOrBitSet.weight != null) {
+            return weightOrBitSet.weight.scorer(context);
+          } else {
+            return scorer(weightOrBitSet.set);
+          }
+        }
+      };
+    }
+
+    private void collectTermContext(IndexReader reader,
+                                    List<LeafReaderContext> leaves,
+                                    TermContext[] contextArray,
+                                    Term[] queryTerms) throws IOException {
+      TermsEnum termsEnum = null;
+      for (LeafReaderContext context : leaves) {
+        final Fields fields = context.reader().fields();
+        for (int i = 0; i < queryTerms.length; i++) {
+          Term term = queryTerms[i];
+          TermContext termContext = contextArray[i];
+          final Terms terms = fields.terms(term.field());
+          if (terms == null) {
+            // field does not exist
+            continue;
+          }
+          termsEnum = terms.iterator();
+          assert termsEnum != null;
+
+          if (termsEnum == TermsEnum.EMPTY) continue;
+          if (termsEnum.seekExact(term.bytes())) {
+            if (termContext == null) {
+              contextArray[i] = new TermContext(reader.getContext(),
+                  termsEnum.termState(), context.ord, termsEnum.docFreq(),
+                  termsEnum.totalTermFreq());
+            } else {
+              termContext.register(termsEnum.termState(), context.ord,
+                  termsEnum.docFreq(), termsEnum.totalTermFreq());
+            }
+
+          }
+        }
+      }
+    }
+  }
+}
--- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
@ -76,6 +76,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
    map.put(HashQParserPlugin.NAME, HashQParserPlugin.class);
    map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class);
    map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class);
+    map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class);
    standardPlugins = Collections.unmodifiableMap(map);
  }

--- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
+++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
@ -165,6 +165,15 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
    }
  }

+  public void testGraphTermsQuery() throws Exception {
+    SolrQueryRequest req = req("q", "*:*");
+    try {
+      assertQueryEquals("graphTerms", req, "{!graphTerms f=field1 maxDocFreq=1000}term1,term2");
+    } finally {
+      req.close();
+    }
+  }
+
  public void testQuerySwitch() throws Exception {
    SolrQueryRequest req = req("myXXX", "XXX", 
                               "myField", "foo_s",
--- a/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java
+++ b/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java
@ -0,0 +1,135 @@
+package org.apache.solr.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelector;
+import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelectorType;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+//We want codecs that support DocValues, and ones supporting blank/empty values.
+@SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42"})
+public class TestGraphTermsQParserPlugin extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-collapseqparser.xml", "schema11.xml");
+  }
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    // if you override setUp or tearDown, you better call
+    // the super classes version
+    super.setUp();
+    clearIndex();
+    assertU(commit());
+  }
+
+  @Test
+  public void testQueries() throws Exception {
+
+    String group = "group_s";
+
+    String[] doc = {"id","1", "term_s", "YYYY", group, "1", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
+    assertU(adoc(doc));
+    String[] doc1 = {"id","2", "term_s","YYYY", group, "1", "test_ti", "5", "test_tl", "100", "test_tf", "200"};
+    assertU(adoc(doc1));
+
+    String[] doc2 = {"id","3", "term_s", "YYYY", "test_ti", "5000", "test_tl", "100", "test_tf", "200"};
+    assertU(adoc(doc2));
+    assertU(commit());
+    String[] doc3 = {"id","4", "term_s", "YYYY", "test_ti", "500", "test_tl", "1000", "test_tf", "2000"};
+    assertU(adoc(doc3));
+
+    String[] doc4 = {"id","5", "term_s", "YYYY", group, "2", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
+    assertU(adoc(doc4));
+    assertU(commit());
+    String[] doc5 = {"id","6", "term_s","YYYY", group, "2", "test_ti", "10", "test_tl", "100", "test_tf", "200"};
+    assertU(adoc(doc5));
+    assertU(commit());
+
+    String[] doc6 = {"id","7", "term_s", "YYYY", group, "1", "test_ti", "10", "test_tl", "50", "test_tf", "300"};
+    assertU(adoc(doc6));
+    assertU(commit());
+
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add("q", "{!graphTerms f=group_s maxDocFreq=10}1,2");
+    params.add("sort", "id asc");
+    assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+        "//result/doc[1]/float[@name='id'][.='1.0']",
+        "//result/doc[2]/float[@name='id'][.='2.0']",
+        "//result/doc[3]/float[@name='id'][.='5.0']",
+        "//result/doc[4]/float[@name='id'][.='6.0']",
+        "//result/doc[5]/float[@name='id'][.='7.0']"
+    );
+
+    //Test without maxDocFreq param. Should default to Integer.MAX_VALUE and match all terms.
+    params = new ModifiableSolrParams();
+    params.add("q", "{!graphTerms f=group_s}1,2");
+    params.add("sort", "id asc");
+    assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+        "//result/doc[1]/float[@name='id'][.='1.0']",
+        "//result/doc[2]/float[@name='id'][.='2.0']",
+        "//result/doc[3]/float[@name='id'][.='5.0']",
+        "//result/doc[4]/float[@name='id'][.='6.0']",
+        "//result/doc[5]/float[@name='id'][.='7.0']"
+    );
+
+    params = new ModifiableSolrParams();
+    params.add("q", "{!graphTerms f=group_s maxDocFreq=1}1,2");
+    params.add("sort", "id asc");
+    assertQ(req(params, "indent", "on"), "*[count(//doc)=0]"
+    );
+
+    //Test with int field
+    params = new ModifiableSolrParams();
+    params.add("q", "{!graphTerms f=test_ti maxDocFreq=10}5,10");
+    params.add("sort", "id asc");
+    assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+        "//result/doc[1]/float[@name='id'][.='1.0']",
+        "//result/doc[2]/float[@name='id'][.='2.0']",
+        "//result/doc[3]/float[@name='id'][.='5.0']",
+        "//result/doc[4]/float[@name='id'][.='6.0']",
+        "//result/doc[5]/float[@name='id'][.='7.0']"
+    );
+
+    //Test with int field
+    params = new ModifiableSolrParams();
+    params.add("q", "{!graphTerms f=test_ti maxDocFreq=3}5,10");
+    params.add("sort", "id asc");
+    assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
+        "//result/doc[1]/float[@name='id'][.='6.0']",
+        "//result/doc[2]/float[@name='id'][.='7.0']"
+    );
+  }
+}