LUCENE-2674: improve interaction of MTQ & terms cache

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2010-09-28 16:31:44 +00:00
parent f5031a6b27
commit a15dd4b5cb
23 changed files with 193 additions and 78 deletions

View File

@ -121,6 +121,9 @@ Changes in backwards compatibility policy
priority queue size, you can use FuzzyQuery(Term, float, int, int) to specify priority queue size, you can use FuzzyQuery(Term, float, int, int) to specify
those explicitly. those explicitly.
* LUCENE-2674: MultiTermQuery.TermCollector.collect now accepts the
TermsEnum as well. (Robert Muir, Mike McCandless)
Changes in Runtime Behavior Changes in Runtime Behavior
* LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit * LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit
@ -150,6 +153,11 @@ API Changes
commit points when they are not needed anymore (instead of waiting for the commit points when they are not needed anymore (instead of waiting for the
next commit). (Shai Erera) next commit). (Shai Erera)
* LUCENE-2674: A new idfExplain method, which accepts an incoming
docFreq, was added to Similarity. If you subclass Similarity, make sure
you also override this method on upgrade. (Robert Muir, Mike
McCandless)
New features New features
* LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions

View File

@ -278,3 +278,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
// document is deleted... // document is deleted...
} }
* LUCENE-2674: A new idfExplain method, which accepts an incoming
docFreq, was added to Similarity. If you subclass Similarity, make sure
you also override this method on upgrade; otherwise your
customizations won't run for certain MultiTermQuery instances.

View File

@ -90,6 +90,10 @@ public class InstantiatedTermsEnum extends TermsEnum {
} }
} }
/** No-op: this enum takes no action in response to the cache hint. */
@Override
public void cacheCurrentTerm() {
}
@Override @Override
public BytesRef term() { public BytesRef term() {
return br; return br;

View File

@ -873,6 +873,10 @@ public class MemoryIndex implements Serializable {
return br; return br;
} }
/** No-op: this enum takes no action in response to the cache hint. */
@Override
public void cacheCurrentTerm() {
}
@Override @Override
public long ord() { public long ord() {
return termUpto; return termUpto;

View File

@ -129,6 +129,11 @@ public class FilterIndexReader extends IndexReader {
return in.seek(text, useCache); return in.seek(text, useCache);
} }
/** Passes the cache hint straight through to the delegate enum {@code in}. */
@Override
public void cacheCurrentTerm() throws IOException {
in.cacheCurrentTerm();
}
@Override @Override
public SeekStatus seek(long ord) throws IOException { public SeekStatus seek(long ord) throws IOException {
return in.seek(ord); return in.seek(ord);

View File

@ -90,6 +90,13 @@ public final class MultiTermsEnum extends TermsEnum {
return current; return current;
} }
/** Forwards the cache hint to the sub-enum of each of the first
 *  {@code numTop} entries in {@code top}. */
@Override
public void cacheCurrentTerm() throws IOException {
  int upto = 0;
  while (upto < numTop) {
    top[upto].terms.cacheCurrentTerm();
    upto++;
  }
}
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return termComp; return termComp;
@ -213,7 +220,7 @@ public final class MultiTermsEnum extends TermsEnum {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
private final void pullTop() { private void pullTop() {
// extract all subs from the queue that have the same // extract all subs from the queue that have the same
// top term // top term
assert numTop == 0; assert numTop == 0;
@ -226,7 +233,7 @@ public final class MultiTermsEnum extends TermsEnum {
current = top[0].current; current = top[0].current;
} }
private final void pushTop() throws IOException { private void pushTop() throws IOException {
// call next() on each top, and put back into queue // call next() on each top, and put back into queue
for(int i=0;i<numTop;i++) { for(int i=0;i<numTop;i++) {
top[i].current = top[i].terms.next(); top[i].current = top[i].terms.next();
@ -418,7 +425,7 @@ public final class MultiTermsEnum extends TermsEnum {
} }
@Override @Override
protected final boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) { protected boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) {
final int cmp = termComp.compare(termsA.current, termsB.current); final int cmp = termComp.compare(termsA.current, termsB.current);
if (cmp != 0) { if (cmp != 0) {
return cmp < 0; return cmp < 0;

View File

@ -123,6 +123,10 @@ public abstract class TermsEnum {
* instance & reuse it. */ * instance & reuse it. */
public abstract Comparator<BytesRef> getComparator() throws IOException; public abstract Comparator<BytesRef> getComparator() throws IOException;
/** Optional optimization hint: informs the codec that the
 *  current term is likely to be sought again soon, so the
 *  codec may choose to cache it.  Because this is only a
 *  hint, an implementation is free to do nothing.
 *
 *  @throws IOException if an implementation hits a
 *          low-level I/O error while handling the hint */
public abstract void cacheCurrentTerm() throws IOException;
/** An empty TermsEnum for quickly returning an empty instance e.g. /** An empty TermsEnum for quickly returning an empty instance e.g.
* in {@link org.apache.lucene.search.MultiTermQuery} * in {@link org.apache.lucene.search.MultiTermQuery}
* <p><em>Please note:</em> This enum should be unmodifiable, * <p><em>Please note:</em> This enum should be unmodifiable,
@ -137,6 +141,9 @@ public abstract class TermsEnum {
@Override @Override
public SeekStatus seek(long ord) { return SeekStatus.END; } public SeekStatus seek(long ord) { return SeekStatus.END; }
/** No-op: the cache hint is ignored by this enum. */
@Override
public void cacheCurrentTerm() {}
@Override @Override
public BytesRef term() { public BytesRef term() {
throw new IllegalStateException("this method should never be called"); throw new IllegalStateException("this method should never be called");

View File

@ -82,6 +82,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
public FieldAndTerm() { public FieldAndTerm() {
} }
/** Creates an entry for the given field and term.  The field name is
 *  stored as-is; the term is stored as a new {@link BytesRef} built from
 *  {@code term} rather than as the caller's instance. */
public FieldAndTerm(String field, BytesRef term) {
this.field = field;
this.term = new BytesRef(term);
}
public FieldAndTerm(FieldAndTerm other) { public FieldAndTerm(FieldAndTerm other) {
field = other.field; field = other.field;
term = new BytesRef(other.term); term = new BytesRef(other.term);
@ -297,6 +302,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return termComp; return termComp;
} }
/** Stores a snapshot of the current term's state in {@code termsCache},
 *  keyed by this field's name and the current term.  The snapshot is a
 *  clone of {@code state} whose file pointer is set to the current
 *  position of {@code in}. */
@Override
public void cacheCurrentTerm() {
  final TermState snapshot = (TermState) state.clone();
  snapshot.filePointer = in.getFilePointer();
  final FieldAndTerm cacheKey = new FieldAndTerm(fieldInfo.name, bytesReader.term);
  termsCache.put(cacheKey, snapshot);
}
/** Seeks until the first term that's >= the provided /** Seeks until the first term that's >= the provided
* text; returns SeekStatus.FOUND if the exact term * text; returns SeekStatus.FOUND if the exact term
* is found, SeekStatus.NOT_FOUND if a different term * is found, SeekStatus.NOT_FOUND if a different term

View File

@ -316,7 +316,7 @@ public class PreFlexFields extends FieldsProducer {
} }
// Seek "back": // Seek "back":
getTermsDict().seekEnum(te, protoTerm.createTerm(term)); getTermsDict().seekEnum(te, protoTerm.createTerm(term), true);
// Test if the term we seek'd to in fact found a // Test if the term we seek'd to in fact found a
// surrogate pair at the same position as the E: // surrogate pair at the same position as the E:
@ -387,7 +387,7 @@ public class PreFlexFields extends FieldsProducer {
if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
// TODO: more efficient seek? // TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term()); getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true);
//newSuffixStart = downTo+4; //newSuffixStart = downTo+4;
newSuffixStart = downTo; newSuffixStart = downTo;
scratchTerm.copy(termEnum.term().bytes()); scratchTerm.copy(termEnum.term().bytes());
@ -443,7 +443,7 @@ public class PreFlexFields extends FieldsProducer {
// TODO: more efficient seek? can we simply swap // TODO: more efficient seek? can we simply swap
// the enums? // the enums?
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm), true);
final Term t2 = termEnum.term(); final Term t2 = termEnum.term();
@ -619,7 +619,7 @@ public class PreFlexFields extends FieldsProducer {
// Seek "forward": // Seek "forward":
// TODO: more efficient seek? // TODO: more efficient seek?
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm), true);
scratchTerm.bytes[upTo] = scratch[0]; scratchTerm.bytes[upTo] = scratch[0];
scratchTerm.bytes[upTo+1] = scratch[1]; scratchTerm.bytes[upTo+1] = scratch[1];
@ -668,7 +668,7 @@ public class PreFlexFields extends FieldsProducer {
// OK seek "back" // OK seek "back"
// TODO: more efficient seek? // TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term()); getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true);
scratchTerm.copy(seekTermEnum.term().bytes()); scratchTerm.copy(seekTermEnum.term().bytes());
@ -701,7 +701,7 @@ public class PreFlexFields extends FieldsProducer {
seekTermEnum = getTermsDict().terms(protoTerm); seekTermEnum = getTermsDict().terms(protoTerm);
//System.out.println(" term=" + termEnum.term()); //System.out.println(" term=" + termEnum.term());
} else { } else {
getTermsDict().seekEnum(termEnum, protoTerm); getTermsDict().seekEnum(termEnum, protoTerm, true);
} }
skipNext = true; skipNext = true;
@ -726,6 +726,11 @@ public class PreFlexFields extends FieldsProducer {
} }
} }
/** Asks the terms dictionary returned by {@code getTermsDict()} to cache
 *  the term that {@code termEnum} currently holds. */
@Override
public void cacheCurrentTerm() throws IOException {
getTermsDict().cacheCurrentTerm(termEnum);
}
@Override @Override
public SeekStatus seek(long ord) throws IOException { public SeekStatus seek(long ord) throws IOException {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
@ -747,7 +752,7 @@ public class PreFlexFields extends FieldsProducer {
assert termEnum != null; assert termEnum != null;
tis.seekEnum(termEnum, t0); tis.seekEnum(termEnum, t0, useCache);
final Term t = termEnum.term(); final Term t = termEnum.term();
@ -783,7 +788,7 @@ public class PreFlexFields extends FieldsProducer {
if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
scratchTerm.copy(seekTermEnum.term().bytes()); scratchTerm.copy(seekTermEnum.term().bytes());
getTermsDict().seekEnum(termEnum, seekTermEnum.term()); getTermsDict().seekEnum(termEnum, seekTermEnum.term(), useCache);
newSuffixStart = 1+i; newSuffixStart = 1+i;

View File

@ -52,7 +52,7 @@ public final class SegmentTermEnum implements Cloneable {
private TermBuffer prevBuffer = new TermBuffer(); private TermBuffer prevBuffer = new TermBuffer();
private TermBuffer scanBuffer = new TermBuffer(); // used for scanning private TermBuffer scanBuffer = new TermBuffer(); // used for scanning
private TermInfo termInfo = new TermInfo(); TermInfo termInfo = new TermInfo();
private int format; private int format;
private boolean isIndex = false; private boolean isIndex = false;
@ -61,7 +61,6 @@ public final class SegmentTermEnum implements Cloneable {
int skipInterval; int skipInterval;
int newSuffixStart; int newSuffixStart;
int maxSkipLevels; int maxSkipLevels;
private int formatM1SkipInterval;
SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi)
throws CorruptIndexException, IOException { throws CorruptIndexException, IOException {

View File

@ -54,8 +54,8 @@ public final class TermInfosReader {
// Just adds term's ord to TermInfo // Just adds term's ord to TermInfo
private final static class TermInfoAndOrd extends TermInfo { private final static class TermInfoAndOrd extends TermInfo {
final int termOrd; final long termOrd;
public TermInfoAndOrd(TermInfo ti, int termOrd) { public TermInfoAndOrd(TermInfo ti, long termOrd) {
super(ti); super(ti);
this.termOrd = termOrd; this.termOrd = termOrd;
} }
@ -228,14 +228,24 @@ public final class TermInfosReader {
return tiOrd; return tiOrd;
} }
return seekEnum(resources.termEnum, term, tiOrd); return seekEnum(resources.termEnum, term, tiOrd, true);
} }
TermInfo seekEnum(SegmentTermEnum enumerator, Term term) throws IOException { public void cacheCurrentTerm(SegmentTermEnum enumerator) {
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term))); termsCache.put(new CloneableTerm(enumerator.term()),
new TermInfoAndOrd(enumerator.termInfo,
enumerator.position));
} }
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { TermInfo seekEnum(SegmentTermEnum enumerator, Term term, boolean useCache) throws IOException {
if (useCache) {
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)), useCache);
} else {
return seekEnum(enumerator, term, null, useCache);
}
}
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, boolean useCache) throws IOException {
if (size == 0) { if (size == 0) {
return null; return null;
} }
@ -252,7 +262,7 @@ public final class TermInfosReader {
final TermInfo ti; final TermInfo ti;
int numScans = enumerator.scanTo(term); int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo(); ti = enumerator.termInfo;
if (numScans > 1) { if (numScans > 1) {
// we only want to put this TermInfo into the cache if // we only want to put this TermInfo into the cache if
// scanEnum skipped more than one dictionary entry. // scanEnum skipped more than one dictionary entry.
@ -260,7 +270,9 @@ public final class TermInfosReader {
// wipe out the cache when they iterate over a large numbers // wipe out the cache when they iterate over a large numbers
// of terms in order // of terms in order
if (tiOrd == null) { if (tiOrd == null) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); if (useCache) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
}
} else { } else {
assert sameTermInfo(ti, tiOrd, enumerator); assert sameTermInfo(ti, tiOrd, enumerator);
assert (int) enumerator.position == tiOrd.termOrd; assert (int) enumerator.position == tiOrd.termOrd;
@ -277,7 +289,7 @@ public final class TermInfosReader {
// random-access: must seek // random-access: must seek
final int indexPos; final int indexPos;
if (tiOrd != null) { if (tiOrd != null) {
indexPos = tiOrd.termOrd / totalIndexInterval; indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
} else { } else {
// Must do binary search: // Must do binary search:
indexPos = getIndexOffset(term); indexPos = getIndexOffset(term);
@ -288,9 +300,11 @@ public final class TermInfosReader {
final TermInfo ti; final TermInfo ti;
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo(); ti = enumerator.termInfo;
if (tiOrd == null) { if (tiOrd == null) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); if (useCache) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
}
} else { } else {
assert sameTermInfo(ti, tiOrd, enumerator); assert sameTermInfo(ti, tiOrd, enumerator);
assert (int) enumerator.position == tiOrd.termOrd; assert (int) enumerator.position == tiOrd.termOrd;

View File

@ -32,6 +32,8 @@ import org.apache.lucene.util.StringHelper;
import java.io.IOException; import java.io.IOException;
import java.util.Comparator; import java.util.Comparator;
import java.util.Map;
import java.util.HashMap;
class SimpleTextFieldsReader extends FieldsProducer { class SimpleTextFieldsReader extends FieldsProducer {
@ -153,6 +155,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
return SeekStatus.END; return SeekStatus.END;
} }
/** No-op: this enum takes no action in response to the cache hint. */
@Override
public void cacheCurrentTerm() {
}
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
assert !ended; assert !ended;
@ -468,16 +474,23 @@ class SimpleTextFieldsReader extends FieldsProducer {
return new SimpleTextFieldsEnum(); return new SimpleTextFieldsEnum();
} }
private final Map<String,Terms> termsCache = new HashMap<String,Terms>();
@Override @Override
public Terms terms(String field) throws IOException { synchronized public Terms terms(String field) throws IOException {
SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator(); Terms terms = termsCache.get(field);
String fieldUpto; if (terms == null) {
while((fieldUpto = fe.next()) != null) { SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
if (fieldUpto.equals(field)) { String fieldUpto;
return new SimpleTextTerms(field, fe.in.getFilePointer()); while((fieldUpto = fe.next()) != null) {
if (fieldUpto.equals(field)) {
terms = new SimpleTextTerms(field, fe.in.getFilePointer());
break;
}
} }
termsCache.put(field, terms);
} }
return null; return terms;
} }
@Override @Override

View File

@ -46,7 +46,6 @@ public abstract class FilteredTermsEnum extends TermsEnum {
private BytesRef initialSeekTerm = null; private BytesRef initialSeekTerm = null;
private boolean doSeek = true; private boolean doSeek = true;
private BytesRef actualTerm = null; private BytesRef actualTerm = null;
private boolean useTermsCache = false;
private final TermsEnum tenum; private final TermsEnum tenum;
@ -116,16 +115,6 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return t; return t;
} }
/** Expert: enable or disable the terms cache when seeking. */
protected final void setUseTermsCache(boolean useTermsCache) {
this.useTermsCache = useTermsCache;
}
/** Expert: enable or disable the terms cache when seeking. */
protected final boolean getUseTermsCache() {
return useTermsCache;
}
/** /**
* Returns the related attributes, the returned {@link AttributeSource} * Returns the related attributes, the returned {@link AttributeSource}
* is shared with the delegate {@code TermsEnum}. * is shared with the delegate {@code TermsEnum}.
@ -189,6 +178,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docsAndPositions(bits, reuse); return tenum.docsAndPositions(bits, reuse);
} }
/** Passes the cache hint through to the underlying enum {@code tenum}. */
// NOTE(review): next() checks for a null tenum but this method does not;
// presumably it is only called after next() has returned a term -- confirm.
@Override
public void cacheCurrentTerm() throws IOException {
tenum.cacheCurrentTerm();
}
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
if (tenum == null) if (tenum == null)
@ -200,7 +194,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
final BytesRef t = nextSeekTerm(actualTerm); final BytesRef t = nextSeekTerm(actualTerm);
// Make sure we always seek forward: // Make sure we always seek forward:
assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t; assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t;
if (t == null || tenum.seek(t, useTermsCache) == SeekStatus.END) { if (t == null || tenum.seek(t, false) == SeekStatus.END) {
// no more terms to seek to or enum exhausted // no more terms to seek to or enum exhausted
return null; return null;
} }

View File

@ -219,6 +219,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq(); return actualEnum.docFreq();
} }
/** Passes the cache hint through to {@code actualEnum}. */
@Override
public void cacheCurrentTerm() throws IOException {
actualEnum.cacheCurrentTerm();
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse); return actualEnum.docs(skipDocs, reuse);

View File

@ -19,7 +19,6 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList;
import java.util.PriorityQueue; import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
@ -201,7 +200,8 @@ public abstract class MultiTermQuery extends Query {
int count = 0; int count = 0;
BytesRef bytes; BytesRef bytes;
while ((bytes = termsEnum.next()) != null) { while ((bytes = termsEnum.next()) != null) {
if (collector.collect(bytes, boostAtt.getBoost())) { if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) {
termsEnum.cacheCurrentTerm();
count++; count++;
} else { } else {
break; break;
@ -215,7 +215,7 @@ public abstract class MultiTermQuery extends Query {
private BoostAttribute boostAtt = null; private BoostAttribute boostAtt = null;
/** return false to stop collecting */ /** return false to stop collecting */
public abstract boolean collect(BytesRef bytes, float boost) throws IOException; public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException;
/** set the minimum boost as a hint for the term producer */ /** set the minimum boost as a hint for the term producer */
protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
@ -231,9 +231,10 @@ public abstract class MultiTermQuery extends Query {
final BooleanQuery result = new BooleanQuery(true); final BooleanQuery result = new BooleanQuery(true);
final Term placeholderTerm = new Term(query.field); final Term placeholderTerm = new Term(query.field);
query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
public boolean collect(BytesRef bytes, float boost) { @Override
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
// add new TQ, we must clone the term, else it may get overwritten! // add new TQ, we must clone the term, else it may get overwritten!
TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes))); TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq());
tq.setBoost(query.getBoost() * boost); // set the boost tq.setBoost(query.getBoost() * boost); // set the boost
result.add(tq, BooleanClause.Occur.SHOULD); // add to query result.add(tq, BooleanClause.Occur.SHOULD); // add to query
return true; return true;
@ -291,20 +292,22 @@ public abstract class MultiTermQuery extends Query {
} }
/** Return a suitable Query for a MultiTermQuery term. */ /** Return a suitable Query for a MultiTermQuery term. */
protected abstract Query getQuery(Term term); protected abstract Query getQuery(Term term, int docCount);
@Override @Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() { collectTerms(reader, query, new TermCollector() {
public boolean collect(BytesRef bytes, float boost) { @Override
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
// ignore uncompetetive hits // ignore uncompetetive hits
if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
return true; return true;
// add new entry in PQ, we must clone the term, else it may get overwritten! // add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes); st.bytes.copy(bytes);
st.boost = boost; st.boost = boost;
st.docFreq = termsEnum.docFreq();
stQueue.offer(st); stQueue.offer(st);
// possibly drop entries from queue // possibly drop entries from queue
st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm();
@ -320,7 +323,7 @@ public abstract class MultiTermQuery extends Query {
final BooleanQuery bq = new BooleanQuery(true); final BooleanQuery bq = new BooleanQuery(true);
for (final ScoreTerm st : stQueue) { for (final ScoreTerm st : stQueue) {
// add new query, we must clone the term, else it may get overwritten! // add new query, we must clone the term, else it may get overwritten!
Query tq = getQuery(placeholderTerm.createTerm(st.bytes)); Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq);
tq.setBoost(query.getBoost() * st.boost); // set the boost tq.setBoost(query.getBoost() * st.boost); // set the boost
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
} }
@ -349,6 +352,7 @@ public abstract class MultiTermQuery extends Query {
private static class ScoreTerm implements Comparable<ScoreTerm> { private static class ScoreTerm implements Comparable<ScoreTerm> {
public final BytesRef bytes = new BytesRef(); public final BytesRef bytes = new BytesRef();
public float boost; public float boost;
public int docFreq;
public int compareTo(ScoreTerm other) { public int compareTo(ScoreTerm other) {
if (this.boost == other.boost) if (this.boost == other.boost)
@ -395,8 +399,8 @@ public abstract class MultiTermQuery extends Query {
} }
@Override @Override
protected Query getQuery(Term term) { protected Query getQuery(Term term, int docFreq) {
return new TermQuery(term); return new TermQuery(term, docFreq);
} }
} }
@ -433,8 +437,8 @@ public abstract class MultiTermQuery extends Query {
} }
@Override @Override
protected Query getQuery(Term term) { protected Query getQuery(Term term, int docFreq) {
return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term))); return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
} }
} }
@ -567,18 +571,14 @@ public abstract class MultiTermQuery extends Query {
this.termCountLimit = termCountLimit; this.termCountLimit = termCountLimit;
} }
public boolean collect(BytesRef bytes, float boost) throws IOException { public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException {
termCount++; termCount++;
if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true; hasCutOff = true;
return false; return false;
} }
pendingTerms.copyUsingLengthPrefix(bytes); pendingTerms.copyUsingLengthPrefix(bytes);
// Loading the TermInfo from the terms dict here docVisitCount += termsEnum.docFreq();
// should not be costly, because 1) the
// query/filter will load the TermInfo when it
// runs, and 2) the terms dict has a cache:
docVisitCount += reader.docFreq(field, bytes);
return true; return true;
} }

View File

@ -741,7 +741,7 @@ public abstract class Similarity implements Serializable {
* The default implementation uses: * The default implementation uses:
* *
* <pre> * <pre>
* idf(searcher.docFreq(term), searcher.maxDoc()); * idf(docFreq, searcher.maxDoc());
* </pre> * </pre>
* *
* Note that {@link Searcher#maxDoc()} is used instead of * Note that {@link Searcher#maxDoc()} is used instead of
@ -752,12 +752,13 @@ public abstract class Similarity implements Serializable {
* *
* @param term the term in question * @param term the term in question
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @param docFreq externally computed docFreq for this term
* @return an IDFExplain object that includes both an idf score factor * @return an IDFExplain object that includes both an idf score factor
and an explanation for the term. and an explanation for the term.
* @throws IOException * @throws IOException
*/ */
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException { public IDFExplanation idfExplain(final Term term, final Searcher searcher, int docFreq) throws IOException {
final int df = searcher.docFreq(term); final int df = docFreq;
final int max = searcher.maxDoc(); final int max = searcher.maxDoc();
final float idf = idf(df, max); final float idf = idf(df, max);
return new IDFExplanation() { return new IDFExplanation() {
@ -772,6 +773,15 @@ public abstract class Similarity implements Serializable {
}}; }};
} }
/**
 * Convenience overload that looks up the document frequency itself:
 * forwards to {@link #idfExplain(Term,Searcher,int)}, passing
 * <code>searcher.docFreq(term)</code> as the docFreq.
 *
 * @param term the term in question
 * @param searcher the document collection being searched
 * @return whatever {@link #idfExplain(Term,Searcher,int)} returns for
 *         the looked-up docFreq
 * @throws IOException
 */
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
  final int docFreq = searcher.docFreq(term);
  return idfExplain(term, searcher, docFreq);
}
/** /**
* Computes a score factor for a phrase. * Computes a score factor for a phrase.
* *

View File

@ -30,7 +30,8 @@ import org.apache.lucene.util.ToStringUtils;
This may be combined with other terms with a {@link BooleanQuery}. This may be combined with other terms with a {@link BooleanQuery}.
*/ */
public class TermQuery extends Query { public class TermQuery extends Query {
private Term term; private final Term term;
private final int docFreq;
private class TermWeight extends Weight { private class TermWeight extends Weight {
private final Similarity similarity; private final Similarity similarity;
@ -43,7 +44,11 @@ public class TermQuery extends Query {
public TermWeight(Searcher searcher) public TermWeight(Searcher searcher)
throws IOException { throws IOException {
this.similarity = getSimilarity(searcher); this.similarity = getSimilarity(searcher);
idfExp = similarity.idfExplain(term, searcher); if (docFreq != -1) {
idfExp = similarity.idfExplain(term, searcher, docFreq);
} else {
idfExp = similarity.idfExplain(term, searcher);
}
idf = idfExp.getIdf(); idf = idfExp.getIdf();
} }
@ -160,7 +165,15 @@ public class TermQuery extends Query {
/** Constructs a query for the term <code>t</code>. */ /** Constructs a query for the term <code>t</code>. */
public TermQuery(Term t) { public TermQuery(Term t) {
this(t, -1);
}
/** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq
* against the searcher. */
public TermQuery(Term t, int docFreq) {
term = t; term = t;
this.docFreq = docFreq;
} }
/** Returns the term of this query. */ /** Returns the term of this query. */

View File

@ -284,6 +284,11 @@ public class DocTermsIndexCreator<T extends DocTermsIndex> extends EntryCreatorW
return term; return term;
} }
/** Not supported by this enum: always throws
 *  {@link UnsupportedOperationException}. */
// NOTE(review): TermsEnum documents cacheCurrentTerm as an optional
// optimization hint; confirm that throwing, rather than ignoring the
// hint, is intended here.
@Override
public void cacheCurrentTerm() throws IOException {
throw new UnsupportedOperationException();
}
@Override @Override
public BytesRef term() throws IOException { public BytesRef term() throws IOException {
return term; return term;

View File

@ -319,6 +319,10 @@ public class TestExternalCodecs extends LuceneTestCase {
return ramField.termToDocs.get(current).docs.size(); return ramField.termToDocs.get(current).docs.size();
} }
/** No-op: this test enum takes no action in response to the cache hint. */
@Override
public void cacheCurrentTerm() {
}
@Override @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);

View File

@ -33,6 +33,7 @@ import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import static org.junit.Assume.*;
/** /**
* Tests lazy skipping on the proximity file. * Tests lazy skipping on the proximity file.
@ -119,12 +120,11 @@ public class TestLazyProxSkipping extends LuceneTestCase {
} }
public void testLazySkipping() throws IOException { public void testLazySkipping() throws IOException {
assumeTrue(!CodecProvider.getDefaultCodec().equals("SimpleText"));
// test whether only the minimum amount of seeks() // test whether only the minimum amount of seeks()
// are performed // are performed
if (!CodecProvider.getDefaultCodec().equals("SimpleText")) { performTest(5);
performTest(5); performTest(10);
performTest(10);
}
} }
public void testSeek() throws IOException { public void testSeek() throws IOException {

View File

@ -87,7 +87,6 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
ascfield2.setLongValue(val); ascfield2.setLongValue(val);
writer.addDocument(doc); writer.addDocument(doc);
} }
reader = writer.getReader(); reader = writer.getReader();
searcher=new IndexSearcher(reader); searcher=new IndexSearcher(reader);
writer.close(); writer.close();

View File

@ -191,9 +191,7 @@ public abstract class LuceneTestCase extends Assert {
private static Map<MockDirectoryWrapper,StackTraceElement[]> stores; private static Map<MockDirectoryWrapper,StackTraceElement[]> stores;
// TODO 4.0: make sure we re-enable SimpleText in the rotation private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"};
//private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"};
private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"};
private static void swapCodec(Codec c) { private static void swapCodec(Codec c) {
final CodecProvider cp = CodecProvider.getDefault(); final CodecProvider cp = CodecProvider.getDefault();
@ -246,8 +244,7 @@ public abstract class LuceneTestCase extends Assert {
swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000))); swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)));
// baseBlockSize cannot be over 127: // baseBlockSize cannot be over 127:
swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127))); swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127)));
// TODO 4.0: add this into test rotation swapCodec(new SimpleTextCodec());
//swapCodec(new SimpleTextCodec());
return cp.lookup(codec); return cp.lookup(codec);
} }

View File

@ -40,7 +40,6 @@ import org.apache.solr.search.*;
import org.apache.solr.util.ByteUtils; import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.LongPriorityQueue; import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils; import org.apache.solr.util.PrimUtils;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.handler.component.StatsValues; import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.FieldFacetStats; import org.apache.solr.handler.component.FieldFacetStats;
import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.OpenBitSet;
@ -1002,6 +1001,11 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq(); return tenum.docFreq();
} }
/** Not supported by this enum: always throws
 *  {@link UnsupportedOperationException}. */
// NOTE(review): TermsEnum documents cacheCurrentTerm as an optional
// optimization hint; confirm that throwing, rather than ignoring the
// hint, is intended here.
@Override
public void cacheCurrentTerm() {
throw new UnsupportedOperationException();
}
public BytesRef skipTo(BytesRef target) throws IOException { public BytesRef skipTo(BytesRef target) throws IOException {
// already here // already here