LUCENE-693: Speed up nested conjunctions

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@597702 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-11-23 16:58:54 +00:00
parent 02d9b51ede
commit f7ee9f6dfb
4 changed files with 118 additions and 117 deletions

View File

@ -259,6 +259,11 @@ Optimizations
raw bytes for each contiguous range of non-deleted documents.
(Robert Engels via Mike McCandless)
13. LUCENE-693: Speed up nested conjunctions (~2x) that match many
documents, and a slight performance increase for top level
conjunctions. (yonik)
Documentation
1. LUCENE-1051: Generate separate javadocs for core, demo and contrib

View File

@ -139,7 +139,7 @@ class BooleanScorer2 extends Scorer {
* When "sum" is used in a name it means score value summing
* over the matching scorers
*/
private void initCountingSumScorer() {
private void initCountingSumScorer() throws IOException {
coordinator.init();
countingSumScorer = makeCountingSumScorer();
}
@ -192,10 +192,10 @@ class BooleanScorer2 extends Scorer {
private static Similarity defaultSimilarity = new DefaultSimilarity();
private Scorer countingConjunctionSumScorer(List requiredScorers) {
private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException {
// each scorer from the list counted as a single matcher
final int requiredNrMatchers = requiredScorers.size();
ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity) {
return new ConjunctionScorer(defaultSimilarity, requiredScorers) {
private int lastScoredDoc = -1;
public float score() throws IOException {
@ -210,34 +210,26 @@ class BooleanScorer2 extends Scorer {
return super.score();
}
};
Iterator rsi = requiredScorers.iterator();
while (rsi.hasNext()) {
cs.add((Scorer) rsi.next());
}
return cs;
}
private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) { // non counting.
ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity);
private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting.
return new ConjunctionScorer(defaultSimilarity, new Scorer[]{req1, req2});
// All scorers match, so defaultSimilarity always has 1 as
// the coordination factor.
// Therefore the sum of the scores of two scorers
// is used as score.
cs.add(req1);
cs.add(req2);
return cs;
}
/** Returns the scorer to be used for match counting and score summing.
* Uses requiredScorers, optionalScorers and prohibitedScorers.
*/
private Scorer makeCountingSumScorer() { // each scorer counted as a single matcher
private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher
return (requiredScorers.size() == 0)
? makeCountingSumScorerNoReq()
: makeCountingSumScorerSomeReq();
}
private Scorer makeCountingSumScorerNoReq() { // No required scorers
private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers
if (optionalScorers.size() == 0) {
return new NonMatchingScorer(); // no clauses or only prohibited clauses
} else { // No required scorers. At least one optional scorer.
@ -258,7 +250,7 @@ class BooleanScorer2 extends Scorer {
}
}
private Scorer makeCountingSumScorerSomeReq() { // At least one required scorer.
private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer.
if (optionalScorers.size() < minNrShouldMatch) {
return new NonMatchingScorer(); // fewer optional clauses than minimum that should match
} else if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required.

View File

@ -18,118 +18,105 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Arrays;
import java.util.Comparator;
/** Scorer for conjunctions, sets of queries, all of which are required. */
class ConjunctionScorer extends Scorer {
private Scorer[] scorers = new Scorer[2];
private int length = 0;
private int first = 0;
private int last = -1;
private boolean firstTime = true;
private boolean more = true;
private float coord;
private final Scorer[] scorers;
public ConjunctionScorer(Similarity similarity) {
private boolean firstTime=true;
private boolean more;
private final float coord;
private int lastDoc=-1;
public ConjunctionScorer(Similarity similarity, Collection scorers) throws IOException {
this(similarity, (Scorer[])scorers.toArray(new Scorer[scorers.size()]));
}
public ConjunctionScorer(Similarity similarity, Scorer[] scorers) throws IOException {
super(similarity);
this.scorers = scorers;
coord = getSimilarity().coord(this.scorers.length, this.scorers.length);
}
final void add(Scorer scorer) {
if (length >= scorers.length) {
// grow the array
Scorer[] temps = new Scorer[scorers.length * 2];
System.arraycopy(scorers, 0, temps, 0, length);
scorers = temps;
}
last += 1;
length += 1;
scorers[last] = scorer;
}
public int doc() { return scorers[first].doc(); }
public int doc() { return lastDoc; }
public boolean next() throws IOException {
if (firstTime) {
init(true);
} else if (more) {
more = scorers[last].next(); // trigger further scanning
}
if (firstTime)
return init(0);
else if (more)
more = scorers[(scorers.length-1)].next();
return doNext();
}
private boolean doNext() throws IOException {
while (more && scorers[first].doc() < scorers[last].doc()) { // find doc w/ all clauses
more = scorers[first].skipTo(scorers[last].doc()); // skip first upto last
last = first; // move first to last
first = (first == length-1) ? 0 : first+1;
int first=0;
Scorer lastScorer = scorers[scorers.length-1];
Scorer firstScorer;
while (more && (firstScorer=scorers[first]).doc() < (lastDoc=lastScorer.doc())) {
more = firstScorer.skipTo(lastDoc);
lastScorer = firstScorer;
first = (first == (scorers.length-1)) ? 0 : first+1;
}
return more; // found a doc with all clauses
return more;
}
public boolean skipTo(int target) throws IOException {
if(firstTime) {
init(false);
}
for (int i = 0, pos = first; i < length; i++) {
if (!more) break;
more = scorers[pos].skipTo(target);
pos = (pos == length-1) ? 0 : pos+1;
}
if (more)
sortScorers(); // re-sort scorers
if (firstTime)
return init(target);
else if (more)
more = scorers[(scorers.length-1)].skipTo(target);
return doNext();
}
public float score() throws IOException {
float sum = 0.0f;
for (int i = 0; i < length; i++) {
sum += scorers[i].score();
}
return sum * coord;
}
private void init(boolean initScorers) throws IOException {
// compute coord factor
coord = getSimilarity().coord(length, length);
more = length > 0;
if(initScorers){
// move each scorer to its first entry
for (int i = 0, pos = first; i < length; i++) {
if (!more) break;
more = scorers[pos].next();
pos = (pos == length-1) ? 0 : pos+1;
}
// initial sort of simulated list
if (more)
sortScorers();
// Note... most of this could be done in the constructor
// thus skipping a check for firstTime per call to next() and skipTo()
private boolean init(int target) throws IOException {
firstTime=false;
more = scorers.length>1;
for (int i=0; i<scorers.length; i++) {
more = target==0 ? scorers[i].next() : scorers[i].skipTo(target);
if (!more)
return false;
}
firstTime = false;
}
// Sort the array the first time...
// We don't need to sort the array in any future calls because we know
// it will already start off sorted (all scorers on same doc).
private void sortScorers() {
// squeeze the array down for the sort
if (length != scorers.length) {
Scorer[] temps = new Scorer[length];
System.arraycopy(scorers, 0, temps, 0, length);
scorers = temps;
}
// note that this comparator is not consistent with equals!
Arrays.sort(scorers, new Comparator() { // sort the array
public int compare(Object o1, Object o2) {
return ((Scorer)o1).doc() - ((Scorer)o2).doc();
}
});
first = 0;
last = length - 1;
doNext();
// If first-time skip distance is any predictor of
// scorer sparseness, then we should always try to skip first on
// those scorers.
// Keep last scorer in it's last place (it will be the first
// to be skipped on), but reverse all of the others so that
// they will be skipped on in order of original high skip.
int end=(scorers.length-1)-1;
for (int i=0; i<(end>>1); i++) {
Scorer tmp = scorers[i];
scorers[i] = scorers[end-i];
scorers[end-i] = tmp;
}
return more;
}
public float score() throws IOException {
float sum = 0.0f;
for (int i = 0; i < scorers.length; i++) {
sum += scorers[i].score();
}
return sum * coord;
}
public Explanation explain(int doc) {

View File

@ -42,6 +42,7 @@ public class TestScorerPerf extends LuceneTestCase {
boolean validate = true; // set to false when doing performance testing
BitSet[] sets;
Term[] terms;
IndexSearcher s;
public void createDummySearcher() throws Exception {
@ -55,22 +56,25 @@ public class TestScorerPerf extends LuceneTestCase {
public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) throws Exception {
int[] freq = new int[nTerms];
terms = new Term[nTerms];
for (int i=0; i<nTerms; i++) {
int f = (nTerms+1)-i; // make first terms less frequent
freq[i] = (int)Math.ceil(Math.pow(f,power));
terms[i] = new Term("f",Character.toString((char)('A'+i)));
}
IndexWriter iw = new IndexWriter(dir,new WhitespaceAnalyzer(), true);
iw.setMaxBufferedDocs(123);
for (int i=0; i<nDocs; i++) {
Document d = new Document();
for (int j=0; j<nTerms; j++) {
if (r.nextInt(freq[j]) == 0) {
d.add(new Field("f", Character.toString((char)j), Field.Store.NO, Field.Index.UN_TOKENIZED));
d.add(new Field("f", terms[j].text(), Field.Store.NO, Field.Index.UN_TOKENIZED));
//System.out.println(d);
}
}
iw.addDocument(d);
}
iw.optimize();
iw.close();
}
@ -168,6 +172,7 @@ public class TestScorerPerf extends LuceneTestCase {
public int doNestedConjunctions(int iter, int maxOuterClauses, int maxClauses) throws IOException {
int ret=0;
long nMatches=0;
for (int i=0; i<iter; i++) {
int oClauses = r.nextInt(maxOuterClauses-1)+2;
@ -185,15 +190,15 @@ public class TestScorerPerf extends LuceneTestCase {
oq.add(bq, BooleanClause.Occur.MUST);
} // outer
CountingHitCollector hc = validate ? new MatchingHitCollector(result)
: new CountingHitCollector();
s.search(oq, hc);
nMatches += hc.getCount();
ret += hc.getSum();
if (validate) assertEquals(result.cardinality(), hc.getCount());
// System.out.println(hc.getCount());
}
System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@ -205,22 +210,28 @@ public class TestScorerPerf extends LuceneTestCase {
) throws IOException {
int ret=0;
long nMatches=0;
for (int i=0; i<iter; i++) {
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
BooleanQuery bq = new BooleanQuery();
BitSet terms = new BitSet(termsInIndex);
BitSet termflag = new BitSet(termsInIndex);
for (int j=0; j<nClauses; j++) {
int tnum;
// don't pick same clause twice
do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
tnum = r.nextInt(termsInIndex);
if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
if (tnum<0 || tnum>=termsInIndex) tnum=termflag.nextClearBit(0);
termflag.set(tnum);
Query tq = new TermQuery(terms[tnum]);
bq.add(tq, BooleanClause.Occur.MUST);
}
CountingHitCollector hc = new CountingHitCollector();
s.search(bq, hc);
nMatches += hc.getCount();
ret += hc.getSum();
}
System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@ -233,7 +244,7 @@ public class TestScorerPerf extends LuceneTestCase {
int iter
) throws IOException {
int ret=0;
long nMatches=0;
for (int i=0; i<iter; i++) {
int oClauses = r.nextInt(maxOuterClauses-1)+2;
BooleanQuery oq = new BooleanQuery();
@ -241,12 +252,15 @@ public class TestScorerPerf extends LuceneTestCase {
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
BooleanQuery bq = new BooleanQuery();
BitSet terms = new BitSet(termsInIndex);
BitSet termflag = new BitSet(termsInIndex);
for (int j=0; j<nClauses; j++) {
int tnum;
// don't pick same clause twice
do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
tnum = r.nextInt(termsInIndex);
if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
if (tnum<0 || tnum>=25) tnum=termflag.nextClearBit(0);
termflag.set(tnum);
Query tq = new TermQuery(terms[tnum]);
bq.add(tq, BooleanClause.Occur.MUST);
} // inner
@ -256,9 +270,10 @@ public class TestScorerPerf extends LuceneTestCase {
CountingHitCollector hc = new CountingHitCollector();
s.search(oq, hc);
nMatches += hc.getCount();
ret += hc.getSum();
}
System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@ -275,7 +290,7 @@ public class TestScorerPerf extends LuceneTestCase {
PhraseQuery q = new PhraseQuery();
for (int j=0; j<nClauses; j++) {
int tnum = r.nextInt(termsInIndex);
q.add(new Term("f",Character.toString((char)tnum)), j);
q.add(new Term("f",Character.toString((char)(tnum+'A'))), j);
}
q.setSlop(termsInIndex); // this could be random too
@ -299,7 +314,8 @@ public class TestScorerPerf extends LuceneTestCase {
}
/***
int bigIter=6;
int bigIter=10;
public void testConjunctionPerf() throws Exception {
createDummySearcher();
validate=false;
@ -326,16 +342,17 @@ public class TestScorerPerf extends LuceneTestCase {
s.close();
}
public void testConjunctionTerms() throws Exception {
validate=false;
RAMDirectory dir = new RAMDirectory();
System.out.println("Creating index");
createRandomTerms(100000,25,2, dir);
createRandomTerms(100000,25,.5, dir);
s = new IndexSearcher(dir);
System.out.println("Starting performance test");
for (int i=0; i<bigIter; i++) {
long start = System.currentTimeMillis();
doTermConjunctions(s,25,5,10000);
doTermConjunctions(s,25,5,1000);
long end = System.currentTimeMillis();
System.out.println("milliseconds="+(end-start));
}
@ -346,12 +363,12 @@ public class TestScorerPerf extends LuceneTestCase {
validate=false;
RAMDirectory dir = new RAMDirectory();
System.out.println("Creating index");
createRandomTerms(100000,25,2, dir);
createRandomTerms(100000,25,.2, dir);
s = new IndexSearcher(dir);
System.out.println("Starting performance test");
for (int i=0; i<bigIter; i++) {
long start = System.currentTimeMillis();
doNestedTermConjunctions(s,25,5,5,1000);
doNestedTermConjunctions(s,25,3,3,200);
long end = System.currentTimeMillis();
System.out.println("milliseconds="+(end-start));
}
@ -373,8 +390,8 @@ public class TestScorerPerf extends LuceneTestCase {
System.out.println("milliseconds="+(end-start));
}
s.close();
}
***/
}