From 040dc8a2f6dbc05b3fa330e07003d05679408a7d Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 13 Oct 2011 17:00:16 +0000 Subject: [PATCH] LUCENE-3510: don't limit number of prohibited clauses to 32 in BooleanScorer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1182982 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/search/BooleanQuery.java | 6 +- .../apache/lucene/search/BooleanScorer.java | 79 +++++++---------- .../lucene/search/TestBooleanScorer.java | 88 +++++++++++++++++-- 3 files changed, 115 insertions(+), 58 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java index 77a474eb133..d9dce63536a 100644 --- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java @@ -329,7 +329,7 @@ public class BooleanQuery extends Query implements Iterable { } // Check if we can return a BooleanScorer - if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) { + if (!scoreDocsInOrder && topScorer && required.size() == 0) { return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord); } @@ -375,10 +375,6 @@ public class BooleanQuery extends Query implements Iterable { } } - if (numProhibited > 32) { // cannot use BS - return false; - } - // scorer() will return an out-of-order scorer if requested. return true; } diff --git a/lucene/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/src/java/org/apache/lucene/search/BooleanScorer.java index 07c17f2f032..21c8731146c 100644 --- a/lucene/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/src/java/org/apache/lucene/search/BooleanScorer.java @@ -29,10 +29,10 @@ import org.apache.lucene.search.BooleanQuery.BooleanWeight; /* Description from Doug Cutting (excerpted from * LUCENE-1483): * - * BooleanScorer uses a ~16k array to score windows of - * docs. So it scores docs 0-16k first, then docs 16-32k, + * BooleanScorer uses an array to score windows of + * 2K docs. So it scores docs 0-2K first, then docs 2K-4K, * etc. For each window it iterates through all query terms - * and accumulates a score in table[doc%16k]. It also stores + * and accumulates a score in table[doc%2K]. It also stores * in the table a bitmask representing which terms * contributed to the score. Non-zero scores are chained in * a linked list. At the end of scoring each window it then @@ -75,9 +75,7 @@ final class BooleanScorer extends Scorer { public void collect(final int doc) throws IOException { final BucketTable table = bucketTable; final int i = doc & BucketTable.MASK; - Bucket bucket = table.buckets[i]; - if (bucket == null) - table.buckets[i] = bucket = new Bucket(); + final Bucket bucket = table.buckets[i]; if (bucket.doc != doc) { // invalid bucket bucket.doc = doc; // set doc @@ -143,6 +141,9 @@ final class BooleanScorer extends Scorer { static final class Bucket { int doc = -1; // tells if bucket is valid float score; // incremental score + // TODO: break out bool anyProhibited, int + // numRequiredMatched; then we can remove 32 limit on + // required clauses int bits; // used for bool constraints int coord; // count of terms in score Bucket next; // next valid bucket @@ -156,7 +157,13 @@ final class BooleanScorer extends Scorer { final Bucket[] buckets = new Bucket[SIZE]; Bucket first = null; // head of valid list - public BucketTable() {} + public BucketTable() { + // Pre-fill to save the lazy init when collecting + // each sub: + for(int idx=0;idx optionalScorers, List prohibitedScorers, int maxCoord) throws IOException { @@ -215,11 +223,8 @@ final class BooleanScorer extends Scorer { if (prohibitedScorers != null && prohibitedScorers.size() > 0) { for (Scorer scorer : prohibitedScorers) { - int mask = nextMask; - nextMask = nextMask << 1; - prohibitedMask |= mask; // update prohibited mask if (scorer.nextDoc() != NO_MORE_DOCS) { - scorers = new SubScorer(scorer, false, true, bucketTable.newCollector(mask), scorers); + scorers = new SubScorer(scorer, false, true, bucketTable.newCollector(PROHIBITED_MASK), scorers); } } } @@ -233,9 +238,12 @@ final class BooleanScorer extends Scorer { // firstDocID is ignored since nextDoc() initializes 'current' @Override public boolean score(Collector collector, int max, int firstDocID) throws IOException { + // Make sure it's only BooleanScorer that calls us: + assert firstDocID == -1; boolean more; Bucket tmp; BucketScorer bs = new BucketScorer(weight); + // The internal loop will set the score and doc before calling collect. collector.setScorer(bs); do { @@ -244,12 +252,13 @@ final class BooleanScorer extends Scorer { while (current != null) { // more queued // check prohibited & required - if ((current.bits & prohibitedMask) == 0) { + if ((current.bits & PROHIBITED_MASK) == 0) { - // TODO: re-enable this if BQ ever sends us required - // clauses - //&& (current.bits & requiredMask) == requiredMask) { + // TODO: re-enable this if BQ ever sends us required + // clauses + //&& (current.bits & requiredMask) == requiredMask) { + // TODO: can we remove this? if (current.doc >= max){ tmp = current; current = current.next; @@ -298,48 +307,22 @@ final class BooleanScorer extends Scorer { @Override public int docID() { - return doc; + throw new UnsupportedOperationException(); } @Override public int nextDoc() throws IOException { - boolean more; - do { - while (bucketTable.first != null) { // more queued - current = bucketTable.first; - bucketTable.first = current.next; // pop the queue - - // check prohibited & required, and minNrShouldMatch - if ((current.bits & prohibitedMask) == 0 && - current.coord >= minNrShouldMatch) { - // TODO: re-enable this if BQ ever sends us required clauses - // (current.bits & requiredMask) == requiredMask && - return doc = current.doc; - } - } - - // refill the queue - more = false; - end += BucketTable.SIZE; - for (SubScorer sub = scorers; sub != null; sub = sub.next) { - int subScorerDocID = sub.scorer.docID(); - if (subScorerDocID != NO_MORE_DOCS) { - more |= sub.scorer.score(sub.collector, end, subScorerDocID); - } - } - } while (bucketTable.first != null || more); - - return doc = NO_MORE_DOCS; + throw new UnsupportedOperationException(); } @Override public float score() { - return current.score * coordFactors[current.coord]; + throw new UnsupportedOperationException(); } @Override public void score(Collector collector) throws IOException { - score(collector, Integer.MAX_VALUE, nextDoc()); + score(collector, Integer.MAX_VALUE, -1); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java b/lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java index 94d4598a302..864fbf51a02 100644 --- a/lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java +++ b/lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java @@ -18,16 +18,19 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery.BooleanWeight; import org.apache.lucene.store.Directory; - import org.apache.lucene.util.LuceneTestCase; public class TestBooleanScorer extends LuceneTestCase @@ -93,13 +96,88 @@ public class TestBooleanScorer extends LuceneTestCase }}; BooleanScorer bs = new BooleanScorer(weight, false, 1, Arrays.asList(scorers), null, scorers.length); - - assertEquals("should have received 3000", 3000, bs.nextDoc()); - assertEquals("should have received NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bs.nextDoc()); + + final List hits = new ArrayList(); + bs.score(new Collector() { + int docBase; + @Override + public void setScorer(Scorer scorer) { + } + + @Override + public void collect(int doc) throws IOException { + hits.add(docBase+doc); + } + + @Override + public void setNextReader(AtomicReaderContext context) { + docBase = context.docBase; + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return true; + } + }); + + assertEquals("should have only 1 hit", 1, hits.size()); + assertEquals("hit should have been docID=3000", 3000, hits.get(0).intValue()); searcher.close(); ir.close(); directory.close(); - } + public void testMoreThan32ProhibitedClauses() throws Exception { + final Directory d = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, d); + Document doc = new Document(); + doc.add(new TextField("field", "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33")); + w.addDocument(doc); + doc = new Document(); + doc.add(new TextField("field", "33")); + w.addDocument(doc); + final IndexReader r = w.getReader(); + w.close(); + final IndexSearcher s = newSearcher(r); + + final BooleanQuery q = new BooleanQuery(); + for(int term=0;term<33;term++) { + q.add(new BooleanClause(new TermQuery(new Term("field", ""+term)), + BooleanClause.Occur.MUST_NOT)); + } + q.add(new BooleanClause(new TermQuery(new Term("field", "33")), + BooleanClause.Occur.SHOULD)); + + final int[] count = new int[1]; + s.search(q, new Collector() { + private Scorer scorer; + + @Override + public void setScorer(Scorer scorer) { + // Make sure we got BooleanScorer: + this.scorer = scorer; + assertEquals("Scorer is implemented by wrong class", BooleanScorer.class.getName() + "$BucketScorer", scorer.getClass().getName()); + } + + @Override + public void collect(int doc) throws IOException { + count[0]++; + } + + @Override + public void setNextReader(AtomicReaderContext context) { + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return true; + } + }); + + assertEquals(1, count[0]); + + s.close(); + r.close(); + d.close(); + } }