mirror of https://github.com/apache/lucene.git
LUCENE-693: Speed up nested conjunctions
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@597702 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
02d9b51ede
commit
f7ee9f6dfb
|
@ -259,6 +259,11 @@ Optimizations
|
|||
raw bytes for each contiguous range of non-deleted documents.
|
||||
(Robert Engels via Mike McCandless)
|
||||
|
||||
13. LUCENE-693: Speed up nested conjunctions (~2x) that match many
|
||||
documents, and a slight performance increase for top level
|
||||
conjunctions. (yonik)
|
||||
|
||||
|
||||
Documentation
|
||||
|
||||
1. LUCENE-1051: Generate separate javadocs for core, demo and contrib
|
||||
|
|
|
@ -139,7 +139,7 @@ class BooleanScorer2 extends Scorer {
|
|||
* When "sum" is used in a name it means score value summing
|
||||
* over the matching scorers
|
||||
*/
|
||||
private void initCountingSumScorer() {
|
||||
private void initCountingSumScorer() throws IOException {
|
||||
coordinator.init();
|
||||
countingSumScorer = makeCountingSumScorer();
|
||||
}
|
||||
|
@ -192,10 +192,10 @@ class BooleanScorer2 extends Scorer {
|
|||
|
||||
private static Similarity defaultSimilarity = new DefaultSimilarity();
|
||||
|
||||
private Scorer countingConjunctionSumScorer(List requiredScorers) {
|
||||
private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException {
|
||||
// each scorer from the list counted as a single matcher
|
||||
final int requiredNrMatchers = requiredScorers.size();
|
||||
ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity) {
|
||||
return new ConjunctionScorer(defaultSimilarity, requiredScorers) {
|
||||
private int lastScoredDoc = -1;
|
||||
|
||||
public float score() throws IOException {
|
||||
|
@ -210,34 +210,26 @@ class BooleanScorer2 extends Scorer {
|
|||
return super.score();
|
||||
}
|
||||
};
|
||||
Iterator rsi = requiredScorers.iterator();
|
||||
while (rsi.hasNext()) {
|
||||
cs.add((Scorer) rsi.next());
|
||||
}
|
||||
return cs;
|
||||
}
|
||||
|
||||
private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) { // non counting.
|
||||
ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity);
|
||||
private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting.
|
||||
return new ConjunctionScorer(defaultSimilarity, new Scorer[]{req1, req2});
|
||||
// All scorers match, so defaultSimilarity always has 1 as
|
||||
// the coordination factor.
|
||||
// Therefore the sum of the scores of two scorers
|
||||
// is used as score.
|
||||
cs.add(req1);
|
||||
cs.add(req2);
|
||||
return cs;
|
||||
}
|
||||
|
||||
/** Returns the scorer to be used for match counting and score summing.
|
||||
* Uses requiredScorers, optionalScorers and prohibitedScorers.
|
||||
*/
|
||||
private Scorer makeCountingSumScorer() { // each scorer counted as a single matcher
|
||||
private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher
|
||||
return (requiredScorers.size() == 0)
|
||||
? makeCountingSumScorerNoReq()
|
||||
: makeCountingSumScorerSomeReq();
|
||||
}
|
||||
|
||||
private Scorer makeCountingSumScorerNoReq() { // No required scorers
|
||||
private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers
|
||||
if (optionalScorers.size() == 0) {
|
||||
return new NonMatchingScorer(); // no clauses or only prohibited clauses
|
||||
} else { // No required scorers. At least one optional scorer.
|
||||
|
@ -258,7 +250,7 @@ class BooleanScorer2 extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
private Scorer makeCountingSumScorerSomeReq() { // At least one required scorer.
|
||||
private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer.
|
||||
if (optionalScorers.size() < minNrShouldMatch) {
|
||||
return new NonMatchingScorer(); // fewer optional clauses than minimum that should match
|
||||
} else if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required.
|
||||
|
|
|
@ -18,118 +18,105 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
||||
class ConjunctionScorer extends Scorer {
|
||||
private Scorer[] scorers = new Scorer[2];
|
||||
private int length = 0;
|
||||
private int first = 0;
|
||||
private int last = -1;
|
||||
private boolean firstTime = true;
|
||||
private boolean more = true;
|
||||
private float coord;
|
||||
private final Scorer[] scorers;
|
||||
|
||||
public ConjunctionScorer(Similarity similarity) {
|
||||
private boolean firstTime=true;
|
||||
private boolean more;
|
||||
private final float coord;
|
||||
private int lastDoc=-1;
|
||||
|
||||
public ConjunctionScorer(Similarity similarity, Collection scorers) throws IOException {
|
||||
this(similarity, (Scorer[])scorers.toArray(new Scorer[scorers.size()]));
|
||||
}
|
||||
|
||||
public ConjunctionScorer(Similarity similarity, Scorer[] scorers) throws IOException {
|
||||
super(similarity);
|
||||
this.scorers = scorers;
|
||||
coord = getSimilarity().coord(this.scorers.length, this.scorers.length);
|
||||
}
|
||||
|
||||
final void add(Scorer scorer) {
|
||||
if (length >= scorers.length) {
|
||||
// grow the array
|
||||
Scorer[] temps = new Scorer[scorers.length * 2];
|
||||
System.arraycopy(scorers, 0, temps, 0, length);
|
||||
scorers = temps;
|
||||
}
|
||||
last += 1;
|
||||
length += 1;
|
||||
scorers[last] = scorer;
|
||||
}
|
||||
|
||||
public int doc() { return scorers[first].doc(); }
|
||||
public int doc() { return lastDoc; }
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (firstTime) {
|
||||
init(true);
|
||||
} else if (more) {
|
||||
more = scorers[last].next(); // trigger further scanning
|
||||
}
|
||||
if (firstTime)
|
||||
return init(0);
|
||||
else if (more)
|
||||
more = scorers[(scorers.length-1)].next();
|
||||
return doNext();
|
||||
}
|
||||
|
||||
|
||||
private boolean doNext() throws IOException {
|
||||
while (more && scorers[first].doc() < scorers[last].doc()) { // find doc w/ all clauses
|
||||
more = scorers[first].skipTo(scorers[last].doc()); // skip first upto last
|
||||
last = first; // move first to last
|
||||
first = (first == length-1) ? 0 : first+1;
|
||||
int first=0;
|
||||
Scorer lastScorer = scorers[scorers.length-1];
|
||||
Scorer firstScorer;
|
||||
while (more && (firstScorer=scorers[first]).doc() < (lastDoc=lastScorer.doc())) {
|
||||
more = firstScorer.skipTo(lastDoc);
|
||||
lastScorer = firstScorer;
|
||||
first = (first == (scorers.length-1)) ? 0 : first+1;
|
||||
}
|
||||
return more; // found a doc with all clauses
|
||||
return more;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
if(firstTime) {
|
||||
init(false);
|
||||
}
|
||||
|
||||
for (int i = 0, pos = first; i < length; i++) {
|
||||
if (!more) break;
|
||||
more = scorers[pos].skipTo(target);
|
||||
pos = (pos == length-1) ? 0 : pos+1;
|
||||
}
|
||||
|
||||
if (more)
|
||||
sortScorers(); // re-sort scorers
|
||||
|
||||
if (firstTime)
|
||||
return init(target);
|
||||
else if (more)
|
||||
more = scorers[(scorers.length-1)].skipTo(target);
|
||||
return doNext();
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < length; i++) {
|
||||
sum += scorers[i].score();
|
||||
}
|
||||
return sum * coord;
|
||||
}
|
||||
|
||||
private void init(boolean initScorers) throws IOException {
|
||||
// compute coord factor
|
||||
coord = getSimilarity().coord(length, length);
|
||||
|
||||
more = length > 0;
|
||||
|
||||
if(initScorers){
|
||||
// move each scorer to its first entry
|
||||
for (int i = 0, pos = first; i < length; i++) {
|
||||
if (!more) break;
|
||||
more = scorers[pos].next();
|
||||
pos = (pos == length-1) ? 0 : pos+1;
|
||||
}
|
||||
// initial sort of simulated list
|
||||
if (more)
|
||||
sortScorers();
|
||||
// Note... most of this could be done in the constructor
|
||||
// thus skipping a check for firstTime per call to next() and skipTo()
|
||||
private boolean init(int target) throws IOException {
|
||||
firstTime=false;
|
||||
more = scorers.length>1;
|
||||
for (int i=0; i<scorers.length; i++) {
|
||||
more = target==0 ? scorers[i].next() : scorers[i].skipTo(target);
|
||||
if (!more)
|
||||
return false;
|
||||
}
|
||||
|
||||
firstTime = false;
|
||||
}
|
||||
// Sort the array the first time...
|
||||
// We don't need to sort the array in any future calls because we know
|
||||
// it will already start off sorted (all scorers on same doc).
|
||||
|
||||
private void sortScorers() {
|
||||
// squeeze the array down for the sort
|
||||
if (length != scorers.length) {
|
||||
Scorer[] temps = new Scorer[length];
|
||||
System.arraycopy(scorers, 0, temps, 0, length);
|
||||
scorers = temps;
|
||||
}
|
||||
|
||||
// note that this comparator is not consistent with equals!
|
||||
Arrays.sort(scorers, new Comparator() { // sort the array
|
||||
public int compare(Object o1, Object o2) {
|
||||
return ((Scorer)o1).doc() - ((Scorer)o2).doc();
|
||||
}
|
||||
});
|
||||
|
||||
first = 0;
|
||||
last = length - 1;
|
||||
|
||||
doNext();
|
||||
|
||||
// If first-time skip distance is any predictor of
|
||||
// scorer sparseness, then we should always try to skip first on
|
||||
// those scorers.
|
||||
// Keep last scorer in it's last place (it will be the first
|
||||
// to be skipped on), but reverse all of the others so that
|
||||
// they will be skipped on in order of original high skip.
|
||||
int end=(scorers.length-1)-1;
|
||||
for (int i=0; i<(end>>1); i++) {
|
||||
Scorer tmp = scorers[i];
|
||||
scorers[i] = scorers[end-i];
|
||||
scorers[end-i] = tmp;
|
||||
}
|
||||
|
||||
return more;
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < scorers.length; i++) {
|
||||
sum += scorers[i].score();
|
||||
}
|
||||
return sum * coord;
|
||||
}
|
||||
|
||||
public Explanation explain(int doc) {
|
||||
|
|
|
@ -42,6 +42,7 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
boolean validate = true; // set to false when doing performance testing
|
||||
|
||||
BitSet[] sets;
|
||||
Term[] terms;
|
||||
IndexSearcher s;
|
||||
|
||||
public void createDummySearcher() throws Exception {
|
||||
|
@ -55,22 +56,25 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
|
||||
public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) throws Exception {
|
||||
int[] freq = new int[nTerms];
|
||||
terms = new Term[nTerms];
|
||||
for (int i=0; i<nTerms; i++) {
|
||||
int f = (nTerms+1)-i; // make first terms less frequent
|
||||
freq[i] = (int)Math.ceil(Math.pow(f,power));
|
||||
terms[i] = new Term("f",Character.toString((char)('A'+i)));
|
||||
}
|
||||
|
||||
IndexWriter iw = new IndexWriter(dir,new WhitespaceAnalyzer(), true);
|
||||
iw.setMaxBufferedDocs(123);
|
||||
for (int i=0; i<nDocs; i++) {
|
||||
Document d = new Document();
|
||||
for (int j=0; j<nTerms; j++) {
|
||||
if (r.nextInt(freq[j]) == 0) {
|
||||
d.add(new Field("f", Character.toString((char)j), Field.Store.NO, Field.Index.UN_TOKENIZED));
|
||||
d.add(new Field("f", terms[j].text(), Field.Store.NO, Field.Index.UN_TOKENIZED));
|
||||
//System.out.println(d);
|
||||
}
|
||||
}
|
||||
iw.addDocument(d);
|
||||
}
|
||||
iw.optimize();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
|
@ -168,6 +172,7 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
|
||||
public int doNestedConjunctions(int iter, int maxOuterClauses, int maxClauses) throws IOException {
|
||||
int ret=0;
|
||||
long nMatches=0;
|
||||
|
||||
for (int i=0; i<iter; i++) {
|
||||
int oClauses = r.nextInt(maxOuterClauses-1)+2;
|
||||
|
@ -185,15 +190,15 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
oq.add(bq, BooleanClause.Occur.MUST);
|
||||
} // outer
|
||||
|
||||
|
||||
CountingHitCollector hc = validate ? new MatchingHitCollector(result)
|
||||
: new CountingHitCollector();
|
||||
s.search(oq, hc);
|
||||
nMatches += hc.getCount();
|
||||
ret += hc.getSum();
|
||||
if (validate) assertEquals(result.cardinality(), hc.getCount());
|
||||
// System.out.println(hc.getCount());
|
||||
}
|
||||
|
||||
System.out.println("Average number of matches="+(nMatches/iter));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -205,22 +210,28 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
) throws IOException {
|
||||
int ret=0;
|
||||
|
||||
long nMatches=0;
|
||||
for (int i=0; i<iter; i++) {
|
||||
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
BitSet terms = new BitSet(termsInIndex);
|
||||
BitSet termflag = new BitSet(termsInIndex);
|
||||
for (int j=0; j<nClauses; j++) {
|
||||
int tnum;
|
||||
// don't pick same clause twice
|
||||
do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
|
||||
Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
|
||||
tnum = r.nextInt(termsInIndex);
|
||||
if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
|
||||
if (tnum<0 || tnum>=termsInIndex) tnum=termflag.nextClearBit(0);
|
||||
termflag.set(tnum);
|
||||
Query tq = new TermQuery(terms[tnum]);
|
||||
bq.add(tq, BooleanClause.Occur.MUST);
|
||||
}
|
||||
|
||||
CountingHitCollector hc = new CountingHitCollector();
|
||||
s.search(bq, hc);
|
||||
nMatches += hc.getCount();
|
||||
ret += hc.getSum();
|
||||
}
|
||||
System.out.println("Average number of matches="+(nMatches/iter));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -233,7 +244,7 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
int iter
|
||||
) throws IOException {
|
||||
int ret=0;
|
||||
|
||||
long nMatches=0;
|
||||
for (int i=0; i<iter; i++) {
|
||||
int oClauses = r.nextInt(maxOuterClauses-1)+2;
|
||||
BooleanQuery oq = new BooleanQuery();
|
||||
|
@ -241,12 +252,15 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
|
||||
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
BitSet terms = new BitSet(termsInIndex);
|
||||
BitSet termflag = new BitSet(termsInIndex);
|
||||
for (int j=0; j<nClauses; j++) {
|
||||
int tnum;
|
||||
// don't pick same clause twice
|
||||
do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
|
||||
Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
|
||||
tnum = r.nextInt(termsInIndex);
|
||||
if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
|
||||
if (tnum<0 || tnum>=25) tnum=termflag.nextClearBit(0);
|
||||
termflag.set(tnum);
|
||||
Query tq = new TermQuery(terms[tnum]);
|
||||
bq.add(tq, BooleanClause.Occur.MUST);
|
||||
} // inner
|
||||
|
||||
|
@ -256,9 +270,10 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
|
||||
CountingHitCollector hc = new CountingHitCollector();
|
||||
s.search(oq, hc);
|
||||
nMatches += hc.getCount();
|
||||
ret += hc.getSum();
|
||||
}
|
||||
|
||||
System.out.println("Average number of matches="+(nMatches/iter));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -275,7 +290,7 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
PhraseQuery q = new PhraseQuery();
|
||||
for (int j=0; j<nClauses; j++) {
|
||||
int tnum = r.nextInt(termsInIndex);
|
||||
q.add(new Term("f",Character.toString((char)tnum)), j);
|
||||
q.add(new Term("f",Character.toString((char)(tnum+'A'))), j);
|
||||
}
|
||||
q.setSlop(termsInIndex); // this could be random too
|
||||
|
||||
|
@ -299,7 +314,8 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
}
|
||||
|
||||
/***
|
||||
int bigIter=6;
|
||||
int bigIter=10;
|
||||
|
||||
public void testConjunctionPerf() throws Exception {
|
||||
createDummySearcher();
|
||||
validate=false;
|
||||
|
@ -326,16 +342,17 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
s.close();
|
||||
}
|
||||
|
||||
|
||||
public void testConjunctionTerms() throws Exception {
|
||||
validate=false;
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
System.out.println("Creating index");
|
||||
createRandomTerms(100000,25,2, dir);
|
||||
createRandomTerms(100000,25,.5, dir);
|
||||
s = new IndexSearcher(dir);
|
||||
System.out.println("Starting performance test");
|
||||
for (int i=0; i<bigIter; i++) {
|
||||
long start = System.currentTimeMillis();
|
||||
doTermConjunctions(s,25,5,10000);
|
||||
doTermConjunctions(s,25,5,1000);
|
||||
long end = System.currentTimeMillis();
|
||||
System.out.println("milliseconds="+(end-start));
|
||||
}
|
||||
|
@ -346,12 +363,12 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
validate=false;
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
System.out.println("Creating index");
|
||||
createRandomTerms(100000,25,2, dir);
|
||||
createRandomTerms(100000,25,.2, dir);
|
||||
s = new IndexSearcher(dir);
|
||||
System.out.println("Starting performance test");
|
||||
for (int i=0; i<bigIter; i++) {
|
||||
long start = System.currentTimeMillis();
|
||||
doNestedTermConjunctions(s,25,5,5,1000);
|
||||
doNestedTermConjunctions(s,25,3,3,200);
|
||||
long end = System.currentTimeMillis();
|
||||
System.out.println("milliseconds="+(end-start));
|
||||
}
|
||||
|
@ -373,8 +390,8 @@ public class TestScorerPerf extends LuceneTestCase {
|
|||
System.out.println("milliseconds="+(end-start));
|
||||
}
|
||||
s.close();
|
||||
|
||||
}
|
||||
***/
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue