negative queries: SOLR-80

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498813 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-01-22 21:23:22 +00:00
parent 4dffe932f5
commit befe8bd62b
4 changed files with 511 additions and 63 deletions

View File

@ -47,6 +47,13 @@ New Features
(facet.offset, facet.limit), and explicit sorting (facet.sort).
facet.zeros is now deprecated. (yonik)
5. SOLR-80: Negative queries are now allowed everywhere. Negative queries
are generated and cached as their positive counterpart, speeding
generation and generally resulting in smaller sets to cache.
Set intersections in SolrIndexSearcher are more efficient,
starting with the smallest positive set, subtracting all negative
sets, then intersecting with all other positive sets. (yonik)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -0,0 +1,121 @@
package org.apache.solr.search;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.MatchAllDocsQuery;
import java.util.List;
import java.util.Arrays;
/**
* @author yonik
* @version $Id$
*/
public class QueryUtils {
/** return true if this query has no positive components */
static boolean isNegative(Query q) {
if (!(q instanceof BooleanQuery)) return false;
BooleanQuery bq = (BooleanQuery)q;
// TODO: use after next lucene update
//for (BooleanClause clause: (List <BooleanClause>)bq.clauses()) {
// if (bq.getClauses().size()==0) return false;
BooleanClause[] clauses = bq.getClauses();
if (clauses.length==0) return false;
for (BooleanClause clause: clauses) {
if (!clause.isProhibited()) return false;
}
return true;
}
/** Returns the original query if it was already a positive query, otherwise
* return the negative of the query (i.e., a positive query).
* <p>
* Example: both id:10 and id:-10 will return id:10
* <p>
* The caller can tell the sign of the original by a reference comparison between
* the original and returned query.
* @param q
* @return
*/
static Query getAbs(Query q) {
if (!(q instanceof BooleanQuery)) return q;
BooleanQuery bq = (BooleanQuery)q;
BooleanClause[] clauses = bq.getClauses();
if (clauses.length==0) return q;
for (BooleanClause clause: clauses) {
if (!clause.isProhibited()) return q;
}
if (clauses.length==1) {
// if only one clause, dispense with the wrapping BooleanQuery
Query negClause = clauses[0].getQuery();
// we shouldn't need to worry about adjusting the boosts since the negative
// clause would have never been selected in a positive query, and hence would
// not contribute to a score.
return negClause;
} else {
BooleanQuery newBq = new BooleanQuery(bq.isCoordDisabled());
newBq.setBoost(bq.getBoost());
// ignore minNrShouldMatch... it doesn't make sense for a negative query
// the inverse of -a -b is a OR b
for (BooleanClause clause: clauses) {
newBq.add(clause.getQuery(), BooleanClause.Occur.SHOULD);
}
return newBq;
}
/*** TODO: use after next lucene update
List <BooleanClause> clauses = (List <BooleanClause>)bq.clauses();
// A single filtered out stopword currently causes a BooleanQuery with
// zero clauses.
if (clauses.size()==0) return q;
for (BooleanClause clause: clauses) {
if (!clause.isProhibited()) return q;
}
if (clauses.size()==1) {
// if only one clause, dispense with the wrapping BooleanQuery
Query negClause = clauses.get(0).getQuery();
// we shouldn't need to worry about adjusting the boosts since the negative
// clause would have never been selected in a positive query, and hence the
// boost is meaningless.
return negClause;
} else {
BooleanQuery newBq = new BooleanQuery(bq.isCoordDisabled());
newBq.setBoost(bq.getBoost());
// ignore minNrShouldMatch... it doesn't make sense for a negative query
// the inverse of -a -b is a b
for (BooleanClause clause: clauses) {
newBq.add(clause.getQuery(), BooleanClause.Occur.SHOULD);
}
return newBq;
}
***/
}
/** Makes negative queries suitable for querying by
* lucene.
*/
static Query makeQueryable(Query q) {
return isNegative(q) ? fixNegativeQuery(q) : q;
}
/** Fixes a negative query by adding a MatchAllDocs query clause.
* The query passed in *must* be a negative query.
*/
static Query fixNegativeQuery(Query q) {
BooleanQuery newBq = (BooleanQuery)q.clone();
newBq.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
return newBq;
}
}

View File

@ -469,44 +469,98 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
* Returns the set of document ids matching a query.
* This method is cache-aware and attempts to retrieve the answer from the cache if possible.
* If the answer was not cached, it may have been inserted into the cache as a result of this call.
* This method can handle negative queries.
* <p>
* The DocSet returned should <b>not</b> be modified.
*/
public DocSet getDocSet(Query query) throws IOException {
DocSet answer;
if (filterCache != null) {
answer = (DocSet)filterCache.get(query);
if (answer!=null) return answer;
}
answer = getDocSetNC(query, null);
// Get the absolute value (positive version) of this query. If we
// get back the same reference, we know it's positive.
Query absQ = QueryUtils.getAbs(query);
boolean positive = query==absQ;
if (filterCache != null) {
filterCache.put(query, answer);
}
return answer;
}
// TODO: do a more efficient version that starts with the
// smallest DocSet and drives the intersection off that
// or implement an intersection() function that takes multiple
// DocSets (prob the better way)
protected DocSet getDocSet(List<Query> queries) throws IOException {
DocSet answer=null;
if (queries==null) return null;
for (Query q : queries) {
if (answer==null) {
answer = getDocSet(q);
} else {
answer = answer.intersection(getDocSet(q));
DocSet absAnswer = (DocSet)filterCache.get(absQ);
if (absAnswer!=null) {
if (positive) return absAnswer;
else return getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
}
}
DocSet absAnswer = getDocSetNC(absQ, null);
DocSet answer = positive ? absAnswer : getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
if (filterCache != null) {
// cache negative queries as positive
filterCache.put(absQ, absAnswer);
}
return answer;
}
// only handle positive (non negative) queries
DocSet getPositiveDocSet(Query q) throws IOException {
DocSet answer;
if (filterCache != null) {
answer = (DocSet)filterCache.get(q);
if (answer!=null) return answer;
}
answer = getDocSetNC(q,null);
if (filterCache != null) filterCache.put(q,answer);
return answer;
}
private static Query matchAllDocsQuery = new MatchAllDocsQuery();
protected DocSet getDocSet(List<Query> queries) throws IOException {
if (queries==null) return null;
if (queries.size()==1) return getDocSet(queries.get(0));
DocSet answer=null;
boolean[] neg = new boolean[queries.size()];
DocSet[] sets = new DocSet[queries.size()];
int smallestIndex = -1;
int smallestCount = Integer.MAX_VALUE;
for (int i=0; i<sets.length; i++) {
Query q = queries.get(i);
Query posQuery = QueryUtils.getAbs(q);
sets[i] = getPositiveDocSet(posQuery);
// Negative query if absolute value different from original
if (q==posQuery) {
neg[i] = false;
// keep track of the smallest positive set.
// This optimization is only worth it if size() is cached, which it would
// be if we don't do any set operations.
int sz = sets[i].size();
if (sz<smallestCount) {
smallestCount=sz;
smallestIndex=i;
answer = sets[i];
}
} else {
neg[i] = true;
}
}
// if no positive queries, start off with all docs
if (answer==null) answer = getPositiveDocSet(matchAllDocsQuery);
// do negative queries first to shrink set size
for (int i=0; i<sets.length; i++) {
if (neg[i]) answer = answer.andNot(sets[i]);
}
for (int i=0; i<sets.length; i++) {
if (!neg[i] && i!=smallestIndex) answer = answer.intersection(sets[i]);
}
return answer;
}
// query must be positive
protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
if (filter==null) {
DocSetHitCollector hc = new DocSetHitCollector(maxDoc());
@ -552,41 +606,22 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
public DocSet getDocSet(Query query, DocSet filter) throws IOException {
if (filter==null) return getDocSet(query);
// Negative query if absolute value different from original
Query absQ = QueryUtils.getAbs(query);
boolean positive = absQ==query;
DocSet first;
if (filterCache != null) {
first = (DocSet)filterCache.get(query);
first = (DocSet)filterCache.get(absQ);
if (first==null) {
first = getDocSetNC(query,null);
filterCache.put(query,first);
first = getDocSetNC(absQ,null);
filterCache.put(absQ,first);
}
return first.intersection(filter);
return positive ? first.intersection(filter) : filter.andNot(first);
}
// If there isn't a cache, then do a single filtered query.
return getDocSetNC(query,filter);
/******* OLD VERSION that did a filtered query instead of
* an intersection if the query docset wasn't found in the cache.
* It made misses != inserts (even if no evictions)
DocSet first=null;
if (filterCache != null) {
first = (DocSet)filterCache.get(query);
if (first != null) {
return first.intersection(filter);
}
}
DocSet answer = getDocSetNC(query, filter);
// nothing is inserted into the cache, because we don't cache materialized filters.
// Hmmm, we *could* make a hitcollector that made a DocSet out of the query at the
// same time it was running the filter though...
// Q: we could call getDocSet(query) and then take the intersection instead of running
// the query as a filter. Then it could be cached.
return answer;
****************/
// If there isn't a cache, then do a single filtered query if positive.
return positive ? getDocSetNC(absQ,filter) : filter.andNot(getPositiveDocSet(absQ));
}
@ -778,6 +813,7 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
int[] ids;
float[] scores;
query = QueryUtils.makeQueryable(query);
// handle zero case...
if (lastDocRequested<=0) {
@ -935,6 +971,9 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
int[] ids;
float[] scores;
final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc());
query = QueryUtils.makeQueryable(query);
// TODO: perhaps unify getDocListAndSetNC and getDocListNC without imposing a significant performance hit
// Comment: gathering the set before the filter is applied allows one to cache
@ -1271,13 +1310,10 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
* @throws IOException
*/
public int numDocs(Query a, DocSet b) throws IOException {
// reverse: do the query on filter and filter using docs...
// or if filter is a term query, can get the freq and
// drive things off that.
// this higher level API leaves open more optimization possibilities.
// prob only worth it if cacheHitRatio is bad...
return b.intersectionSize(getDocSet(a));
// Negative query if absolute value different from original
Query absQ = QueryUtils.getAbs(a);
DocSet positiveA = getPositiveDocSet(absQ);
return a==absQ ? b.intersectionSize(positiveA) : b.andNotSize(positiveA);
}
/**
@ -1291,7 +1327,25 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
* @throws IOException
*/
public int numDocs(Query a, Query b) throws IOException {
return getDocSet(b).intersectionSize(getDocSet(a));
Query absA = QueryUtils.getAbs(a);
Query absB = QueryUtils.getAbs(b);
DocSet positiveA = getPositiveDocSet(absA);
DocSet positiveB = getPositiveDocSet(absB);
// Negative query if absolute value different from original
if (a==absA) {
if (b==absB) return positiveA.intersectionSize(positiveB);
return positiveA.andNotSize(positiveB);
}
if (b==absB) return positiveB.andNotSize(positiveA);
// if both negative, we need to create a temp DocSet since we
// don't have a counting method that takes three.
DocSet all = getPositiveDocSet(matchAllDocsQuery);
// -a -b == *:*.andNot(a).andNotSize(b) == *.*.andNotSize(a.union(b))
// we use the last form since the intermediate DocSet should normally be smaller.
return all.andNotSize(positiveA.union(positiveB));
}

View File

@ -0,0 +1,266 @@
package org.apache.solr.search;
import junit.framework.TestCase;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.lucene.index.Term;
import org.apache.solr.util.AbstractSolrTestCase;
/**
* @author yonik
* @version $Id$
*/
public class TestQueryUtils extends AbstractSolrTestCase {
public String getSchemaFile() { return "schema.xml"; }
public String getSolrConfigFile() { return "solrconfig.xml"; }
public void setUp() throws Exception {
super.setUp();
}
public void tearDown() throws Exception {
super.tearDown();
}
public void positive(Query q) {
assertFalse(QueryUtils.isNegative(q));
assertTrue(QueryUtils.getAbs(q)==q);
BooleanClause[] clauses = (q instanceof BooleanQuery) ? ((BooleanQuery)q).getClauses() : null;
if (clauses != null) {
if (clauses.length != 0) {
assertTrue(QueryUtils.makeQueryable(q)==q);
}
} else {
assertTrue(QueryUtils.makeQueryable(q)==q);
}
}
public void negative(Query q) {
assertTrue(QueryUtils.isNegative(q));
Query abs = QueryUtils.getAbs(q);
assertTrue(q != abs);
Query neg2 = QueryUtils.fixNegativeQuery(q);
assertFalse(abs.equals(q));
assertFalse(neg2.equals(q));
}
public void testNegativeQueries() {
TermQuery tq = new TermQuery(new Term("hi","there"));
TermQuery tq2 = new TermQuery(new Term("wow","dude"));
BooleanQuery bq = new BooleanQuery();
positive(tq);
// positive(bq);
bq.add(tq, BooleanClause.Occur.SHOULD);
positive(bq);
bq.add(tq2, BooleanClause.Occur.MUST_NOT);
positive(bq);
bq = new BooleanQuery();
bq.add(tq,BooleanClause.Occur.MUST_NOT);
negative(bq);
bq.add(tq2,BooleanClause.Occur.MUST_NOT);
negative(bq);
String f = "name"; // name is whitespace tokenized
assertU(adoc("id", "1", f, "A"));
assertU(adoc("id", "2", f, "B"));
assertU(adoc("id", "3", f, "C"));
assertU(adoc("id", "4", f, "C"));
assertU(adoc("id", "5", f, "D"));
assertU(adoc("id", "6", f, "E"));
assertU(adoc("id", "7", f, "E"));
assertU(adoc("id", "8", f, "E W"));
assertU(adoc("id", "9", f, "F W"));
assertU(adoc("id", "10", f, "G W"));
assertU(adoc("id", "11", f, "G X "));
assertU(adoc("id", "12", f, "G X Y"));
assertU(adoc("id", "13", f, "G X Y Z"));
assertU(adoc("id", "14", f, "G Y Z"));
assertU(adoc("id", "15", f, "G Z"));
assertU(adoc("id", "16", f, "G"));
assertU(commit());
assertQ("test negative base q matching nothing",
req("-qlkciyopsbgzyvkylsjhchghjrdf")
,"//result[@numFound='16']"
);
assertQ("test negative base q matching something",
req("-name:E")
,"//result[@numFound='13']"
);
assertQ("test negative base q with two terms",
req("-name:G -name:W")
,"//result[@numFound='7']"
);
assertQ("test negative base q with three terms",
req("-name:G -name:W -name:E")
,"//result[@numFound='5']"
);
assertQ("test negative boolean query",
req("-(name:G OR name:W)")
,"//result[@numFound='7']"
);
assertQ("test non negative q",
req("-name:G -name:W -name:E id:[* TO *]")
,"//result[@numFound='5']"
);
assertQ("test non negative q",
req("-name:G -name:W -name:E +id:[* TO *]")
,"//result[@numFound='5']"
);
// now for the filters...
assertQ("test negative base q matching nothing, with filters",
req("q","-qlkciyopsbgzyvkylsjhchghjrdf"
,"fq","name:A"
)
,"//result[@numFound='1']"
);
assertQ("test negative filters",
req("q","name:A"
,"fq","-name:A"
)
,"//result[@numFound='0']"
);
assertQ("test negative filters",
req("q","name:A"
,"fq","-name:A"
)
,"//result[@numFound='0']"
);
assertQ("test negative filters",
req("q","-name:E"
,"fq","name:E"
)
,"//result[@numFound='0']"
);
assertQ("test negative filters",
req("q","-name:E"
,"fq","name:W"
)
,"//result[@numFound='2']"
);
assertQ("test negative filters",
req("q","-name:E"
,"fq","name:W"
)
,"//result[@numFound='2']"
);
assertQ("one pos filter, one neg",
req("q","-name:E"
,"fq","name:W"
,"fq","-name:G"
)
,"//result[@numFound='1']"
);
assertQ("two neg filters",
req("q","-name:E"
,"fq","-name:W"
,"fq","-name:G"
)
,"//result[@numFound='5']" // ABCCD
);
assertQ("three neg filters",
req("q","-name:E"
,"fq","-name:W"
,"fq","-name:G"
,"fq","-name:C"
)
,"//result[@numFound='3']" // ABD
);
assertQ("compound neg filters",
req("q","-name:E"
,"fq","-name:W -name:G"
,"fq","-name:C"
)
,"//result[@numFound='3']" // ABD
);
assertQ("compound neg filters",
req("q","-name:E"
,"fq","-name:W -name:G -name:C"
)
,"//result[@numFound='3']" // ABD
);
assertQ("compound neg filters",
req("q","-name:E"
,"fq","-(name:W name:G name:C)"
)
,"//result[@numFound='3']" // ABD
);
assertQ("three neg filters + pos",
req("q","-name:E"
,"fq","-name:W"
,"fq","-name:G"
,"fq","-name:C"
,"fq","name:G"
)
,"//result[@numFound='0']"
);
assertQ("three neg filters + pos",
req("q","-name:E"
,"fq","-name:W"
,"fq","-name:G"
,"fq","-name:C"
,"fq","+id:1"
)
,"//result[@numFound='1']" // A
);
assertQ("three neg filters + pos",
req("q","-name:E"
,"fq","-name:W"
,"fq","-name:G"
,"fq","-name:C"
,"fq","id:[* TO *]"
)
,"//result[@numFound='3']" // ABD
);
// QueryParser turns term queries on stopwords into a BooleanQuery with
// zero clauses.
assertQ("neg base query on stopword",
req("q","-text:stopworda")
,"//result[@numFound='16']" // ABD
);
assertQ("negative filter on stopword",
req("q","id:[* TO *]"
,"fq","-text:stopworda"
)
,"//result[@numFound='16']" // ABD
);
assertQ("two negative filters on stopword",
req("q","id:[* TO *]"
,"fq","-text:stopworda"
,"fq","-text:stopworda"
)
,"//result[@numFound='16']" // ABD
);
assertQ("compound negative filters with stopword",
req("q","id:[* TO *]"
,"fq","-text:stopworda -id:1"
)
,"//result[@numFound='15']" // ABD
);
}
}