negative queries: SOLR-80

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498813 13f79535-47bb-0310-9956-ffa450edef68
2007-01-22 21:23:22 +00:00 · 2007-01-22 21:23:22 +00:00 · befe8bd62b
parent 4dffe932f5
commit befe8bd62b
4 changed files with 511 additions and 63 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -47,6 +47,13 @@ New Features
    (facet.offset, facet.limit), and explicit sorting (facet.sort).
    facet.zeros is now deprecated.  (yonik)

+ 5. SOLR-80: Negative queries are now allowed everywhere.  Negative queries
+    are generated and cached as their positive counterpart, speeding
+    generation and generally resulting in smaller sets to cache.
+    Set intersections in SolrIndexSearcher are more efficient,
+    starting with the smallest positive set, subtracting all negative
+    sets, then intersecting with all other positive sets.  (yonik)
+
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main 
    user query, not boost or filter queries (klaas).
--- a/src/java/org/apache/solr/search/QueryUtils.java
+++ b/src/java/org/apache/solr/search/QueryUtils.java
@ -0,0 +1,121 @@
+package org.apache.solr.search;
+
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.MatchAllDocsQuery;
+
+import java.util.List;
+import java.util.Arrays;
+
+/**
+ * @author yonik
+ * @version $Id$
+ */
+public class QueryUtils {
+
+  /** return true if this query has no positive components */
+  static boolean isNegative(Query q) {
+    if (!(q instanceof BooleanQuery)) return false;
+    BooleanQuery bq = (BooleanQuery)q;
+    // TODO: use after next lucene update
+    //for (BooleanClause clause: (List <BooleanClause>)bq.clauses()) {
+    // if (bq.getClauses().size()==0) return false;
+    BooleanClause[] clauses = bq.getClauses();
+    if (clauses.length==0) return false;
+    for (BooleanClause clause: clauses) {
+      if (!clause.isProhibited()) return false;
+    }
+    return true;
+  }
+
+  /** Returns the original query if it was already a positive query, otherwise
+   * return the negative of the query (i.e., a positive query).
+   * <p>
+   * Example: both id:10 and id:-10 will return id:10
+   * <p>
+   * The caller can tell the sign of the original by a reference comparison between
+   * the original and returned query.
+   * @param q
+   * @return
+   */
+  static Query getAbs(Query q) {
+    if (!(q instanceof BooleanQuery)) return q;
+    BooleanQuery bq = (BooleanQuery)q;
+
+    BooleanClause[] clauses = bq.getClauses();
+    if (clauses.length==0) return q;
+
+
+    for (BooleanClause clause: clauses) {
+      if (!clause.isProhibited()) return q;
+    }
+
+    if (clauses.length==1) {
+      // if only one clause, dispense with the wrapping BooleanQuery
+      Query negClause = clauses[0].getQuery();
+      // we shouldn't need to worry about adjusting the boosts since the negative
+      // clause would have never been selected in a positive query, and hence would
+      // not contribute to a score.
+      return negClause;
+    } else {
+      BooleanQuery newBq = new BooleanQuery(bq.isCoordDisabled());
+      newBq.setBoost(bq.getBoost());
+      // ignore minNrShouldMatch... it doesn't make sense for a negative query
+
+      // the inverse of -a -b is a OR b
+      for (BooleanClause clause: clauses) {
+        newBq.add(clause.getQuery(), BooleanClause.Occur.SHOULD);
+      }
+      return newBq;
+    }
+
+
+    /*** TODO: use after next lucene update
+    List <BooleanClause> clauses = (List <BooleanClause>)bq.clauses();
+    // A single filtered out stopword currently causes a BooleanQuery with
+    // zero clauses.
+    if (clauses.size()==0) return q;
+
+    for (BooleanClause clause: clauses) {
+      if (!clause.isProhibited()) return q;
+    }
+
+    if (clauses.size()==1) {
+      // if only one clause, dispense with the wrapping BooleanQuery
+      Query negClause = clauses.get(0).getQuery();
+      // we shouldn't need to worry about adjusting the boosts since the negative
+      // clause would have never been selected in a positive query, and hence the
+      // boost is meaningless.
+      return negClause;
+    } else {
+      BooleanQuery newBq = new BooleanQuery(bq.isCoordDisabled());
+      newBq.setBoost(bq.getBoost());
+      // ignore minNrShouldMatch... it doesn't make sense for a negative query
+
+      // the inverse of -a -b is a b
+      for (BooleanClause clause: clauses) {
+        newBq.add(clause.getQuery(), BooleanClause.Occur.SHOULD);
+      }
+      return newBq;
+    }
+    ***/
+  }
+
+  /** Makes negative queries suitable for querying by
+   * lucene.
+   */
+  static Query makeQueryable(Query q) {
+    return isNegative(q) ? fixNegativeQuery(q) : q;
+  }
+
+  /** Fixes a negative query by adding a MatchAllDocs query clause.
+   * The query passed in *must* be a negative query.
+   */
+  static Query fixNegativeQuery(Query q) {
+    BooleanQuery newBq = (BooleanQuery)q.clone();
+    newBq.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
+    return newBq;    
+  }
+
+}
--- a/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/src/java/org/apache/solr/search/SolrIndexSearcher.java
@ -469,44 +469,98 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
   * Returns the set of document ids matching a query.
   * This method is cache-aware and attempts to retrieve the answer from the cache if possible.
   * If the answer was not cached, it may have been inserted into the cache as a result of this call.
+   * This method can handle negative queries.
   * <p>
   * The DocSet returned should <b>not</b> be modified.
   */
  public DocSet getDocSet(Query query) throws IOException {
-    DocSet answer;
-    if (filterCache != null) {
-      answer = (DocSet)filterCache.get(query);
-      if (answer!=null) return answer;
-    }
-
-    answer = getDocSetNC(query, null);
+    // Get the absolute value (positive version) of this query.  If we
+    // get back the same reference, we know it's positive.
+    Query absQ = QueryUtils.getAbs(query);
+    boolean positive = query==absQ;

    if (filterCache != null) {
-      filterCache.put(query, answer);
-    }
-
-    return answer;
-  }
-
-
-  // TODO: do a more efficient version that starts with the
-  // smallest DocSet and drives the intersection off that
-  // or implement an intersection() function that takes multiple
-  // DocSets (prob the better way)
-  protected DocSet getDocSet(List<Query> queries) throws IOException {
-    DocSet answer=null;
-    if (queries==null) return null;
-    for (Query q : queries) {
-      if (answer==null) {
-        answer = getDocSet(q);
-      } else {
-        answer = answer.intersection(getDocSet(q));
+      DocSet absAnswer = (DocSet)filterCache.get(absQ);
+      if (absAnswer!=null) {
+        if (positive) return absAnswer;
+        else return getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
      }
    }
+
+    DocSet absAnswer = getDocSetNC(absQ, null);
+    DocSet answer = positive ? absAnswer : getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
+
+    if (filterCache != null) {
+      // cache negative queries as positive
+      filterCache.put(absQ, absAnswer);
+    }
+
+    return answer;
+  }
+
+  // only handle positive (non negative) queries
+  DocSet getPositiveDocSet(Query q) throws IOException {
+    DocSet answer;
+    if (filterCache != null) {
+      answer = (DocSet)filterCache.get(q);
+      if (answer!=null) return answer;
+    }
+    answer = getDocSetNC(q,null);
+    if (filterCache != null) filterCache.put(q,answer);
    return answer;
  }


+  private static Query matchAllDocsQuery = new MatchAllDocsQuery();
+
+
+  protected DocSet getDocSet(List<Query> queries) throws IOException {
+    if (queries==null) return null;
+    if (queries.size()==1) return getDocSet(queries.get(0));
+    DocSet answer=null;
+
+    boolean[] neg = new boolean[queries.size()];
+    DocSet[] sets = new DocSet[queries.size()];
+
+    int smallestIndex = -1;
+    int smallestCount = Integer.MAX_VALUE;
+    for (int i=0; i<sets.length; i++) {
+      Query q = queries.get(i);
+      Query posQuery = QueryUtils.getAbs(q);
+      sets[i] = getPositiveDocSet(posQuery);
+      // Negative query if absolute value different from original
+      if (q==posQuery) {
+        neg[i] = false;
+        // keep track of the smallest positive set.
+        // This optimization is only worth it if size() is cached, which it would
+        // be if we don't do any set operations.
+        int sz = sets[i].size();
+        if (sz<smallestCount) {
+          smallestCount=sz;
+          smallestIndex=i;
+          answer = sets[i];
+        }
+      } else {
+        neg[i] = true;
+      }
+    }
+
+    // if no positive queries, start off with all docs
+    if (answer==null) answer = getPositiveDocSet(matchAllDocsQuery);
+
+    // do negative queries first to shrink set size
+    for (int i=0; i<sets.length; i++) {
+      if (neg[i]) answer = answer.andNot(sets[i]);
+    }
+
+    for (int i=0; i<sets.length; i++) {
+      if (!neg[i] && i!=smallestIndex) answer = answer.intersection(sets[i]);
+    }
+
+    return answer;
+  }
+
+  // query must be positive
  protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
    if (filter==null) {
      DocSetHitCollector hc = new DocSetHitCollector(maxDoc());
@ -552,41 +606,22 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
  public DocSet getDocSet(Query query, DocSet filter) throws IOException {
    if (filter==null) return getDocSet(query);

+    // Negative query if absolute value different from original
+    Query absQ = QueryUtils.getAbs(query);
+    boolean positive = absQ==query;
+
    DocSet first;
    if (filterCache != null) {
-      first = (DocSet)filterCache.get(query);
+      first = (DocSet)filterCache.get(absQ);
      if (first==null) {
-        first = getDocSetNC(query,null);
-        filterCache.put(query,first);
+        first = getDocSetNC(absQ,null);
+        filterCache.put(absQ,first);
      }
-      return first.intersection(filter);
+      return positive ? first.intersection(filter) : filter.andNot(first);
    }

-
-    // If there isn't a cache, then do a single filtered query.
-    return getDocSetNC(query,filter);
-
-
-    /******* OLD VERSION that did a filtered query instead of
-     * an intersection if the query docset wasn't found in the cache.
-     * It made misses != inserts (even if no evictions)
-    DocSet first=null;
-    if (filterCache != null) {
-      first = (DocSet)filterCache.get(query);
-      if (first != null) {
-        return first.intersection(filter);
-      }
-    }
-
-    DocSet answer = getDocSetNC(query, filter);
-    // nothing is inserted into the cache, because we don't cache materialized filters.
-    // Hmmm, we *could* make a hitcollector that made a DocSet out of the query at the
-    // same time it was running the filter though...
-
-    // Q: we could call getDocSet(query) and then take the intersection instead of running
-    // the query as a filter.  Then it could be cached.
-    return answer;
-    ****************/
+    // If there isn't a cache, then do a single filtered query if positive.
+    return positive ? getDocSetNC(absQ,filter) : filter.andNot(getPositiveDocSet(absQ));
  }


@ -778,6 +813,7 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
    int[] ids;
    float[] scores;

+    query = QueryUtils.makeQueryable(query);

    // handle zero case...
    if (lastDocRequested<=0) {
@ -935,6 +971,9 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
    int[] ids;
    float[] scores;
    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc());
+
+    query = QueryUtils.makeQueryable(query);
+
    // TODO: perhaps unify getDocListAndSetNC and getDocListNC without imposing a significant performance hit

    // Comment: gathering the set before the filter is applied allows one to cache
@ -1271,13 +1310,10 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
   * @throws IOException
   */
  public int numDocs(Query a, DocSet b) throws IOException {
-    // reverse: do the query on filter and filter using docs...
-    // or if filter is a term query, can get the freq and
-    // drive things off that.
-    //  this higher level API leaves open more optimization possibilities.
-    // prob only worth it if cacheHitRatio is bad...
-
-    return b.intersectionSize(getDocSet(a));
+    // Negative query if absolute value different from original
+    Query absQ = QueryUtils.getAbs(a);
+    DocSet positiveA = getPositiveDocSet(absQ);
+    return a==absQ ? b.intersectionSize(positiveA) : b.andNotSize(positiveA);
  }

   /**
@ -1291,7 +1327,25 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
   * @throws IOException
   */
  public int numDocs(Query a, Query b) throws IOException {
-    return getDocSet(b).intersectionSize(getDocSet(a));
+    Query absA = QueryUtils.getAbs(a);
+    Query absB = QueryUtils.getAbs(b);     
+    DocSet positiveA = getPositiveDocSet(absA);
+    DocSet positiveB = getPositiveDocSet(absB);
+
+    // Negative query if absolute value different from original
+    if (a==absA) {
+      if (b==absB) return positiveA.intersectionSize(positiveB);
+      return positiveA.andNotSize(positiveB);
+    }
+    if (b==absB) return positiveB.andNotSize(positiveA);
+
+    // if both negative, we need to create a temp DocSet since we
+    // don't have a counting method that takes three.
+    DocSet all = getPositiveDocSet(matchAllDocsQuery);
+
+    // -a -b == *:*.andNot(a).andNotSize(b) == *.*.andNotSize(a.union(b))
+    // we use the last form since the intermediate DocSet should normally be smaller.
+    return all.andNotSize(positiveA.union(positiveB));
  }


--- a/src/test/org/apache/solr/search/TestQueryUtils.java
+++ b/src/test/org/apache/solr/search/TestQueryUtils.java
@ -0,0 +1,266 @@
+package org.apache.solr.search;
+
+import junit.framework.TestCase;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.index.Term;
+import org.apache.solr.util.AbstractSolrTestCase;
+
+/**
+ * @author yonik
+ * @version $Id$
+ */
+public class TestQueryUtils extends AbstractSolrTestCase {
+
+  public String getSchemaFile() { return "schema.xml"; }
+  public String getSolrConfigFile() { return "solrconfig.xml"; }
+
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+  public void tearDown() throws Exception {
+    super.tearDown();
+  }
+
+  public void positive(Query q) {
+    assertFalse(QueryUtils.isNegative(q));
+    assertTrue(QueryUtils.getAbs(q)==q);
+    BooleanClause[] clauses = (q instanceof BooleanQuery) ? ((BooleanQuery)q).getClauses() : null;
+    if (clauses != null) {
+      if (clauses.length != 0) {
+        assertTrue(QueryUtils.makeQueryable(q)==q);
+      }
+    } else {
+      assertTrue(QueryUtils.makeQueryable(q)==q);
+    }
+  }
+
+  public void negative(Query q) {
+    assertTrue(QueryUtils.isNegative(q));
+    Query abs = QueryUtils.getAbs(q);
+    assertTrue(q != abs);
+    Query neg2 = QueryUtils.fixNegativeQuery(q);
+
+    assertFalse(abs.equals(q));
+    assertFalse(neg2.equals(q));
+  }
+
+  public void testNegativeQueries() {
+    TermQuery tq = new TermQuery(new Term("hi","there"));
+    TermQuery tq2 = new TermQuery(new Term("wow","dude"));
+    BooleanQuery bq = new BooleanQuery();
+
+    positive(tq);
+    // positive(bq);
+    bq.add(tq, BooleanClause.Occur.SHOULD);
+    positive(bq);
+    bq.add(tq2, BooleanClause.Occur.MUST_NOT);
+    positive(bq);
+
+    bq = new BooleanQuery();
+    bq.add(tq,BooleanClause.Occur.MUST_NOT);
+    negative(bq);
+
+    bq.add(tq2,BooleanClause.Occur.MUST_NOT);
+    negative(bq);
+
+
+    String f = "name";  // name is whitespace tokenized
+
+    assertU(adoc("id", "1",  f, "A"));
+    assertU(adoc("id", "2",  f, "B"));
+    assertU(adoc("id", "3",  f, "C"));
+    assertU(adoc("id", "4",  f, "C"));
+    assertU(adoc("id", "5",  f, "D"));
+    assertU(adoc("id", "6",  f, "E"));
+    assertU(adoc("id", "7",  f, "E"));
+    assertU(adoc("id", "8",  f, "E W"));
+    assertU(adoc("id", "9",  f, "F W"));
+    assertU(adoc("id", "10", f, "G W"));
+    assertU(adoc("id", "11", f, "G X "));
+    assertU(adoc("id", "12", f, "G X Y"));
+    assertU(adoc("id", "13", f, "G X Y Z"));
+    assertU(adoc("id", "14", f, "G Y Z"));
+    assertU(adoc("id", "15", f, "G Z"));
+    assertU(adoc("id", "16", f, "G"));
+    assertU(commit());
+
+    assertQ("test negative base q matching nothing",
+            req("-qlkciyopsbgzyvkylsjhchghjrdf")
+            ,"//result[@numFound='16']"
+            );
+
+    assertQ("test negative base q matching something",
+            req("-name:E")
+            ,"//result[@numFound='13']"
+            );
+
+    assertQ("test negative base q with two terms",
+            req("-name:G -name:W")
+            ,"//result[@numFound='7']"
+            );
+
+    assertQ("test negative base q with three terms",
+            req("-name:G -name:W -name:E")
+            ,"//result[@numFound='5']"
+            );
+
+    assertQ("test negative boolean query",
+            req("-(name:G OR name:W)")
+            ,"//result[@numFound='7']"
+            );
+
+    assertQ("test non negative q",
+            req("-name:G -name:W -name:E id:[* TO *]")
+            ,"//result[@numFound='5']"
+            );
+
+    assertQ("test non negative q",
+            req("-name:G -name:W -name:E +id:[* TO *]")
+            ,"//result[@numFound='5']"
+            );
+
+    // now for the filters...
+    assertQ("test negative base q matching nothing, with filters",
+            req("q","-qlkciyopsbgzyvkylsjhchghjrdf"
+                ,"fq","name:A"
+            )
+            ,"//result[@numFound='1']"
+            );
+
+    assertQ("test negative filters",
+            req("q","name:A"
+                ,"fq","-name:A"
+            )
+            ,"//result[@numFound='0']"
+            );
+    assertQ("test negative filters",
+            req("q","name:A"
+                ,"fq","-name:A"
+            )
+            ,"//result[@numFound='0']"
+            );
+    assertQ("test negative filters",
+            req("q","-name:E"
+                ,"fq","name:E"
+            )
+            ,"//result[@numFound='0']"
+            );
+    assertQ("test negative filters",
+            req("q","-name:E"
+                ,"fq","name:W"
+            )
+            ,"//result[@numFound='2']"
+            );
+    assertQ("test negative filters",
+            req("q","-name:E"
+                ,"fq","name:W"
+            )
+            ,"//result[@numFound='2']"
+            );
+    assertQ("one pos filter, one neg",
+            req("q","-name:E"
+                ,"fq","name:W"
+                ,"fq","-name:G"
+            )
+            ,"//result[@numFound='1']"
+            );
+        assertQ("two neg filters",
+            req("q","-name:E"
+                ,"fq","-name:W"
+                ,"fq","-name:G"
+            )
+            ,"//result[@numFound='5']"  // ABCCD
+            );
+
+        assertQ("three neg filters",
+            req("q","-name:E"
+                ,"fq","-name:W"
+                ,"fq","-name:G"
+                ,"fq","-name:C"
+            )
+            ,"//result[@numFound='3']"  // ABD
+            );
+
+        assertQ("compound neg filters",
+            req("q","-name:E"
+                ,"fq","-name:W -name:G"
+                ,"fq","-name:C"
+            )
+            ,"//result[@numFound='3']"  // ABD
+            );
+
+         assertQ("compound neg filters",
+            req("q","-name:E"
+                ,"fq","-name:W -name:G -name:C"
+            )
+            ,"//result[@numFound='3']"  // ABD
+            );
+
+        assertQ("compound neg filters",
+            req("q","-name:E"
+                ,"fq","-(name:W name:G name:C)"
+            )
+            ,"//result[@numFound='3']"  // ABD
+            );
+
+        assertQ("three neg filters + pos",
+            req("q","-name:E"
+                ,"fq","-name:W"
+                ,"fq","-name:G"
+                ,"fq","-name:C"
+                ,"fq","name:G"
+            )
+            ,"//result[@numFound='0']"
+            );
+        assertQ("three neg filters + pos",
+            req("q","-name:E"
+                ,"fq","-name:W"
+                ,"fq","-name:G"
+                ,"fq","-name:C"
+                ,"fq","+id:1"
+            )
+            ,"//result[@numFound='1']"  // A
+            );
+         assertQ("three neg filters + pos",
+            req("q","-name:E"
+                ,"fq","-name:W"
+                ,"fq","-name:G"
+                ,"fq","-name:C"
+                ,"fq","id:[* TO *]"
+            )
+            ,"//result[@numFound='3']"  // ABD
+            );
+
+         // QueryParser turns term queries on stopwords into a BooleanQuery with
+         // zero clauses.
+         assertQ("neg base query on stopword",
+            req("q","-text:stopworda")
+            ,"//result[@numFound='16']"  // ABD
+            );
+
+         assertQ("negative filter on stopword",
+            req("q","id:[* TO *]"
+                ,"fq","-text:stopworda"
+            )
+            ,"//result[@numFound='16']"  // ABD
+            );
+         assertQ("two negative filters on stopword",
+            req("q","id:[* TO *]"
+                ,"fq","-text:stopworda"
+                ,"fq","-text:stopworda"
+            )
+            ,"//result[@numFound='16']"  // ABD
+            );
+         assertQ("compound negative filters with stopword",
+            req("q","id:[* TO *]"
+                ,"fq","-text:stopworda -id:1"
+            )
+            ,"//result[@numFound='15']"  // ABD
+            );
+  }
+
+
+}