mirror of https://github.com/apache/lucene.git
LUCENE-2674: improve interaction of MTQ & terms cache
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f5031a6b27
commit
a15dd4b5cb
|
@ -121,6 +121,9 @@ Changes in backwards compatibility policy
|
|||
priority queue size, you can use FuzzyQuery(Term, float, int, int) to specify
|
||||
those explicitly.
|
||||
|
||||
* LUCENE-2674: MultiTermQuery.TermCollector.collect now accepts the
|
||||
TermsEnum as well. (Robert Muir, Mike McCandless)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit
|
||||
|
@ -150,6 +153,11 @@ API Changes
|
|||
commit points when they are not needed anymore (instead of waiting for the
|
||||
next commit). (Shai Erera)
|
||||
|
||||
* LUCENE-2674: A new idfExplain method was added to Similarity, that
|
||||
accepts an incoming docFreq. If you subclass Similarity, make sure
|
||||
you also override this method on upgrade. (Robert Muir, Mike
|
||||
McCandless)
|
||||
|
||||
New features
|
||||
|
||||
* LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions
|
||||
|
|
|
@ -278,3 +278,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
|
|||
// document is deleted...
|
||||
}
|
||||
|
||||
* LUCENE-2674: A new idfExplain method was added to Similarity, that
|
||||
accepts an incoming docFreq. If you subclass Similarity, make sure
|
||||
you also override this method on upgrade, otherwise your
|
||||
customizations won't run for certain MultiTermQuerys.
|
||||
|
||||
|
|
|
@ -90,6 +90,10 @@ public class InstantiatedTermsEnum extends TermsEnum {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef term() {
|
||||
return br;
|
||||
|
|
|
@ -873,6 +873,10 @@ public class MemoryIndex implements Serializable {
|
|||
return br;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ord() {
|
||||
return termUpto;
|
||||
|
|
|
@ -129,6 +129,11 @@ public class FilterIndexReader extends IndexReader {
|
|||
return in.seek(text, useCache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
in.cacheCurrentTerm();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(long ord) throws IOException {
|
||||
return in.seek(ord);
|
||||
|
|
|
@ -90,6 +90,13 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
return current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
for(int i=0;i<numTop;i++) {
|
||||
top[i].terms.cacheCurrentTerm();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return termComp;
|
||||
|
@ -213,7 +220,7 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
private final void pullTop() {
|
||||
private void pullTop() {
|
||||
// extract all subs from the queue that have the same
|
||||
// top term
|
||||
assert numTop == 0;
|
||||
|
@ -226,7 +233,7 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
current = top[0].current;
|
||||
}
|
||||
|
||||
private final void pushTop() throws IOException {
|
||||
private void pushTop() throws IOException {
|
||||
// call next() on each top, and put back into queue
|
||||
for(int i=0;i<numTop;i++) {
|
||||
top[i].current = top[i].terms.next();
|
||||
|
@ -418,7 +425,7 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected final boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) {
|
||||
protected boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) {
|
||||
final int cmp = termComp.compare(termsA.current, termsB.current);
|
||||
if (cmp != 0) {
|
||||
return cmp < 0;
|
||||
|
|
|
@ -123,6 +123,10 @@ public abstract class TermsEnum {
|
|||
* instance & reuse it. */
|
||||
public abstract Comparator<BytesRef> getComparator() throws IOException;
|
||||
|
||||
/** Optional optimization hint: informs the codec that the
|
||||
* current term is likely to be re-seek'd-to soon. */
|
||||
public abstract void cacheCurrentTerm() throws IOException;
|
||||
|
||||
/** An empty TermsEnum for quickly returning an empty instance e.g.
|
||||
* in {@link org.apache.lucene.search.MultiTermQuery}
|
||||
* <p><em>Please note:</em> This enum should be unmodifiable,
|
||||
|
@ -137,6 +141,9 @@ public abstract class TermsEnum {
|
|||
@Override
|
||||
public SeekStatus seek(long ord) { return SeekStatus.END; }
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {}
|
||||
|
||||
@Override
|
||||
public BytesRef term() {
|
||||
throw new IllegalStateException("this method should never be called");
|
||||
|
|
|
@ -82,6 +82,11 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
public FieldAndTerm() {
|
||||
}
|
||||
|
||||
public FieldAndTerm(String field, BytesRef term) {
|
||||
this.field = field;
|
||||
this.term = new BytesRef(term);
|
||||
}
|
||||
|
||||
public FieldAndTerm(FieldAndTerm other) {
|
||||
field = other.field;
|
||||
term = new BytesRef(other.term);
|
||||
|
@ -297,6 +302,14 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
return termComp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
TermState stateCopy = (TermState) state.clone();
|
||||
stateCopy.filePointer = in.getFilePointer();
|
||||
termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term),
|
||||
stateCopy);
|
||||
}
|
||||
|
||||
/** Seeks until the first term that's >= the provided
|
||||
* text; returns SeekStatus.FOUND if the exact term
|
||||
* is found, SeekStatus.NOT_FOUND if a different term
|
||||
|
|
|
@ -316,7 +316,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
}
|
||||
|
||||
// Seek "back":
|
||||
getTermsDict().seekEnum(te, protoTerm.createTerm(term));
|
||||
getTermsDict().seekEnum(te, protoTerm.createTerm(term), true);
|
||||
|
||||
// Test if the term we seek'd to in fact found a
|
||||
// surrogate pair at the same position as the E:
|
||||
|
@ -387,7 +387,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
|
||||
// TODO: more efficient seek?
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true);
|
||||
//newSuffixStart = downTo+4;
|
||||
newSuffixStart = downTo;
|
||||
scratchTerm.copy(termEnum.term().bytes());
|
||||
|
@ -443,7 +443,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
// TODO: more efficient seek? can we simply swap
|
||||
// the enums?
|
||||
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
|
||||
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm), true);
|
||||
|
||||
final Term t2 = termEnum.term();
|
||||
|
||||
|
@ -619,7 +619,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
// Seek "forward":
|
||||
// TODO: more efficient seek?
|
||||
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
|
||||
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm), true);
|
||||
|
||||
scratchTerm.bytes[upTo] = scratch[0];
|
||||
scratchTerm.bytes[upTo+1] = scratch[1];
|
||||
|
@ -668,7 +668,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
// OK seek "back"
|
||||
// TODO: more efficient seek?
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true);
|
||||
|
||||
scratchTerm.copy(seekTermEnum.term().bytes());
|
||||
|
||||
|
@ -701,7 +701,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
seekTermEnum = getTermsDict().terms(protoTerm);
|
||||
//System.out.println(" term=" + termEnum.term());
|
||||
} else {
|
||||
getTermsDict().seekEnum(termEnum, protoTerm);
|
||||
getTermsDict().seekEnum(termEnum, protoTerm, true);
|
||||
}
|
||||
skipNext = true;
|
||||
|
||||
|
@ -726,6 +726,11 @@ public class PreFlexFields extends FieldsProducer {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
getTermsDict().cacheCurrentTerm(termEnum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(long ord) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
|
@ -747,7 +752,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
assert termEnum != null;
|
||||
|
||||
tis.seekEnum(termEnum, t0);
|
||||
tis.seekEnum(termEnum, t0, useCache);
|
||||
|
||||
final Term t = termEnum.term();
|
||||
|
||||
|
@ -783,7 +788,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
|
||||
|
||||
scratchTerm.copy(seekTermEnum.term().bytes());
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
|
||||
getTermsDict().seekEnum(termEnum, seekTermEnum.term(), useCache);
|
||||
|
||||
newSuffixStart = 1+i;
|
||||
|
||||
|
|
|
@ -52,7 +52,7 @@ public final class SegmentTermEnum implements Cloneable {
|
|||
private TermBuffer prevBuffer = new TermBuffer();
|
||||
private TermBuffer scanBuffer = new TermBuffer(); // used for scanning
|
||||
|
||||
private TermInfo termInfo = new TermInfo();
|
||||
TermInfo termInfo = new TermInfo();
|
||||
|
||||
private int format;
|
||||
private boolean isIndex = false;
|
||||
|
@ -61,7 +61,6 @@ public final class SegmentTermEnum implements Cloneable {
|
|||
int skipInterval;
|
||||
int newSuffixStart;
|
||||
int maxSkipLevels;
|
||||
private int formatM1SkipInterval;
|
||||
|
||||
SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi)
|
||||
throws CorruptIndexException, IOException {
|
||||
|
|
|
@ -54,8 +54,8 @@ public final class TermInfosReader {
|
|||
|
||||
// Just adds term's ord to TermInfo
|
||||
private final static class TermInfoAndOrd extends TermInfo {
|
||||
final int termOrd;
|
||||
public TermInfoAndOrd(TermInfo ti, int termOrd) {
|
||||
final long termOrd;
|
||||
public TermInfoAndOrd(TermInfo ti, long termOrd) {
|
||||
super(ti);
|
||||
this.termOrd = termOrd;
|
||||
}
|
||||
|
@ -228,14 +228,24 @@ public final class TermInfosReader {
|
|||
return tiOrd;
|
||||
}
|
||||
|
||||
return seekEnum(resources.termEnum, term, tiOrd);
|
||||
return seekEnum(resources.termEnum, term, tiOrd, true);
|
||||
}
|
||||
|
||||
TermInfo seekEnum(SegmentTermEnum enumerator, Term term) throws IOException {
|
||||
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)));
|
||||
public void cacheCurrentTerm(SegmentTermEnum enumerator) {
|
||||
termsCache.put(new CloneableTerm(enumerator.term()),
|
||||
new TermInfoAndOrd(enumerator.termInfo,
|
||||
enumerator.position));
|
||||
}
|
||||
|
||||
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
|
||||
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, boolean useCache) throws IOException {
|
||||
if (useCache) {
|
||||
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)), useCache);
|
||||
} else {
|
||||
return seekEnum(enumerator, term, null, useCache);
|
||||
}
|
||||
}
|
||||
|
||||
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, boolean useCache) throws IOException {
|
||||
if (size == 0) {
|
||||
return null;
|
||||
}
|
||||
|
@ -252,7 +262,7 @@ public final class TermInfosReader {
|
|||
final TermInfo ti;
|
||||
int numScans = enumerator.scanTo(term);
|
||||
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
|
||||
ti = enumerator.termInfo();
|
||||
ti = enumerator.termInfo;
|
||||
if (numScans > 1) {
|
||||
// we only want to put this TermInfo into the cache if
|
||||
// scanEnum skipped more than one dictionary entry.
|
||||
|
@ -260,7 +270,9 @@ public final class TermInfosReader {
|
|||
// wipe out the cache when they iterate over a large numbers
|
||||
// of terms in order
|
||||
if (tiOrd == null) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position));
|
||||
if (useCache) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
|
||||
}
|
||||
} else {
|
||||
assert sameTermInfo(ti, tiOrd, enumerator);
|
||||
assert (int) enumerator.position == tiOrd.termOrd;
|
||||
|
@ -277,7 +289,7 @@ public final class TermInfosReader {
|
|||
// random-access: must seek
|
||||
final int indexPos;
|
||||
if (tiOrd != null) {
|
||||
indexPos = tiOrd.termOrd / totalIndexInterval;
|
||||
indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
|
||||
} else {
|
||||
// Must do binary search:
|
||||
indexPos = getIndexOffset(term);
|
||||
|
@ -288,9 +300,11 @@ public final class TermInfosReader {
|
|||
final TermInfo ti;
|
||||
|
||||
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
|
||||
ti = enumerator.termInfo();
|
||||
ti = enumerator.termInfo;
|
||||
if (tiOrd == null) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position));
|
||||
if (useCache) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
|
||||
}
|
||||
} else {
|
||||
assert sameTermInfo(ti, tiOrd, enumerator);
|
||||
assert (int) enumerator.position == tiOrd.termOrd;
|
||||
|
|
|
@ -32,6 +32,8 @@ import org.apache.lucene.util.StringHelper;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
class SimpleTextFieldsReader extends FieldsProducer {
|
||||
|
||||
|
@ -153,6 +155,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
return SeekStatus.END;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
assert !ended;
|
||||
|
@ -468,16 +474,23 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
return new SimpleTextFieldsEnum();
|
||||
}
|
||||
|
||||
private final Map<String,Terms> termsCache = new HashMap<String,Terms>();
|
||||
|
||||
@Override
|
||||
public Terms terms(String field) throws IOException {
|
||||
SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
|
||||
String fieldUpto;
|
||||
while((fieldUpto = fe.next()) != null) {
|
||||
if (fieldUpto.equals(field)) {
|
||||
return new SimpleTextTerms(field, fe.in.getFilePointer());
|
||||
synchronized public Terms terms(String field) throws IOException {
|
||||
Terms terms = termsCache.get(field);
|
||||
if (terms == null) {
|
||||
SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
|
||||
String fieldUpto;
|
||||
while((fieldUpto = fe.next()) != null) {
|
||||
if (fieldUpto.equals(field)) {
|
||||
terms = new SimpleTextTerms(field, fe.in.getFilePointer());
|
||||
break;
|
||||
}
|
||||
}
|
||||
termsCache.put(field, terms);
|
||||
}
|
||||
return null;
|
||||
return terms;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -46,7 +46,6 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
private BytesRef initialSeekTerm = null;
|
||||
private boolean doSeek = true;
|
||||
private BytesRef actualTerm = null;
|
||||
private boolean useTermsCache = false;
|
||||
|
||||
private final TermsEnum tenum;
|
||||
|
||||
|
@ -116,16 +115,6 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
return t;
|
||||
}
|
||||
|
||||
/** Expert: enable or disable the terms cache when seeking. */
|
||||
protected final void setUseTermsCache(boolean useTermsCache) {
|
||||
this.useTermsCache = useTermsCache;
|
||||
}
|
||||
|
||||
/** Expert: enable or disable the terms cache when seeking. */
|
||||
protected final boolean getUseTermsCache() {
|
||||
return useTermsCache;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the related attributes, the returned {@link AttributeSource}
|
||||
* is shared with the delegate {@code TermsEnum}.
|
||||
|
@ -188,6 +177,11 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
assert tenum != null;
|
||||
return tenum.docsAndPositions(bits, reuse);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
tenum.cacheCurrentTerm();
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
|
@ -200,7 +194,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
final BytesRef t = nextSeekTerm(actualTerm);
|
||||
// Make sure we always seek forward:
|
||||
assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t;
|
||||
if (t == null || tenum.seek(t, useTermsCache) == SeekStatus.END) {
|
||||
if (t == null || tenum.seek(t, false) == SeekStatus.END) {
|
||||
// no more terms to seek to or enum exhausted
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -219,6 +219,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
return actualEnum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
actualEnum.cacheCurrentTerm();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
return actualEnum.docs(skipDocs, reuse);
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -201,7 +200,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
int count = 0;
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
if (collector.collect(bytes, boostAtt.getBoost())) {
|
||||
if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
count++;
|
||||
} else {
|
||||
break;
|
||||
|
@ -215,7 +215,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
private BoostAttribute boostAtt = null;
|
||||
|
||||
/** return false to stop collecting */
|
||||
public abstract boolean collect(BytesRef bytes, float boost) throws IOException;
|
||||
public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException;
|
||||
|
||||
/** set the minimum boost as a hint for the term producer */
|
||||
protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
|
||||
|
@ -231,9 +231,10 @@ public abstract class MultiTermQuery extends Query {
|
|||
final BooleanQuery result = new BooleanQuery(true);
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
|
||||
public boolean collect(BytesRef bytes, float boost) {
|
||||
@Override
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
|
||||
// add new TQ, we must clone the term, else it may get overwritten!
|
||||
TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)));
|
||||
TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq());
|
||||
tq.setBoost(query.getBoost() * boost); // set the boost
|
||||
result.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
return true;
|
||||
|
@ -291,20 +292,22 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
/** Return a suitable Query for a MultiTermQuery term. */
|
||||
protected abstract Query getQuery(Term term);
|
||||
protected abstract Query getQuery(Term term, int docCount);
|
||||
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
collectTerms(reader, query, new TermCollector() {
|
||||
public boolean collect(BytesRef bytes, float boost) {
|
||||
@Override
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
|
||||
// ignore uncompetetive hits
|
||||
if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
|
||||
return true;
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm();
|
||||
|
@ -320,7 +323,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
for (final ScoreTerm st : stQueue) {
|
||||
// add new query, we must clone the term, else it may get overwritten!
|
||||
Query tq = getQuery(placeholderTerm.createTerm(st.bytes));
|
||||
Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq);
|
||||
tq.setBoost(query.getBoost() * st.boost); // set the boost
|
||||
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
}
|
||||
|
@ -349,6 +352,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
private static class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final BytesRef bytes = new BytesRef();
|
||||
public float boost;
|
||||
public int docFreq;
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
if (this.boost == other.boost)
|
||||
|
@ -395,8 +399,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Query getQuery(Term term) {
|
||||
return new TermQuery(term);
|
||||
protected Query getQuery(Term term, int docFreq) {
|
||||
return new TermQuery(term, docFreq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -433,8 +437,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Query getQuery(Term term) {
|
||||
return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term)));
|
||||
protected Query getQuery(Term term, int docFreq) {
|
||||
return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -567,18 +571,14 @@ public abstract class MultiTermQuery extends Query {
|
|||
this.termCountLimit = termCountLimit;
|
||||
}
|
||||
|
||||
public boolean collect(BytesRef bytes, float boost) throws IOException {
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException {
|
||||
termCount++;
|
||||
if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
hasCutOff = true;
|
||||
return false;
|
||||
}
|
||||
pendingTerms.copyUsingLengthPrefix(bytes);
|
||||
// Loading the TermInfo from the terms dict here
|
||||
// should not be costly, because 1) the
|
||||
// query/filter will load the TermInfo when it
|
||||
// runs, and 2) the terms dict has a cache:
|
||||
docVisitCount += reader.docFreq(field, bytes);
|
||||
docVisitCount += termsEnum.docFreq();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -741,7 +741,7 @@ public abstract class Similarity implements Serializable {
|
|||
* The default implementation uses:
|
||||
*
|
||||
* <pre>
|
||||
* idf(searcher.docFreq(term), searcher.maxDoc());
|
||||
* idf(docFreq, searcher.maxDoc());
|
||||
* </pre>
|
||||
*
|
||||
* Note that {@link Searcher#maxDoc()} is used instead of
|
||||
|
@ -752,12 +752,13 @@ public abstract class Similarity implements Serializable {
|
|||
*
|
||||
* @param term the term in question
|
||||
* @param searcher the document collection being searched
|
||||
* @param docFreq externally computed docFreq for this term
|
||||
* @return an IDFExplain object that includes both an idf score factor
|
||||
and an explanation for the term.
|
||||
* @throws IOException
|
||||
*/
|
||||
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
|
||||
final int df = searcher.docFreq(term);
|
||||
public IDFExplanation idfExplain(final Term term, final Searcher searcher, int docFreq) throws IOException {
|
||||
final int df = docFreq;
|
||||
final int max = searcher.maxDoc();
|
||||
final float idf = idf(df, max);
|
||||
return new IDFExplanation() {
|
||||
|
@ -772,6 +773,15 @@ public abstract class Similarity implements Serializable {
|
|||
}};
|
||||
}
|
||||
|
||||
/**
|
||||
* This method forwards to {@link
|
||||
* idfExplain(Term,Searcher,int)} by passing
|
||||
* <code>searcher.docFreq(term)</code> as the docFreq.
|
||||
*/
|
||||
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
|
||||
return idfExplain(term, searcher, searcher.docFreq(term));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a score factor for a phrase.
|
||||
*
|
||||
|
|
|
@ -30,7 +30,8 @@ import org.apache.lucene.util.ToStringUtils;
|
|||
This may be combined with other terms with a {@link BooleanQuery}.
|
||||
*/
|
||||
public class TermQuery extends Query {
|
||||
private Term term;
|
||||
private final Term term;
|
||||
private final int docFreq;
|
||||
|
||||
private class TermWeight extends Weight {
|
||||
private final Similarity similarity;
|
||||
|
@ -43,7 +44,11 @@ public class TermQuery extends Query {
|
|||
public TermWeight(Searcher searcher)
|
||||
throws IOException {
|
||||
this.similarity = getSimilarity(searcher);
|
||||
idfExp = similarity.idfExplain(term, searcher);
|
||||
if (docFreq != -1) {
|
||||
idfExp = similarity.idfExplain(term, searcher, docFreq);
|
||||
} else {
|
||||
idfExp = similarity.idfExplain(term, searcher);
|
||||
}
|
||||
idf = idfExp.getIdf();
|
||||
}
|
||||
|
||||
|
@ -160,7 +165,15 @@ public class TermQuery extends Query {
|
|||
|
||||
/** Constructs a query for the term <code>t</code>. */
|
||||
public TermQuery(Term t) {
|
||||
this(t, -1);
|
||||
}
|
||||
|
||||
/** Expert: constructs a TermQuery that will use the
|
||||
* provided docFreq instead of looking up the docFreq
|
||||
* against the searcher. */
|
||||
public TermQuery(Term t, int docFreq) {
|
||||
term = t;
|
||||
this.docFreq = docFreq;
|
||||
}
|
||||
|
||||
/** Returns the term of this query. */
|
||||
|
|
|
@ -284,6 +284,11 @@ public class DocTermsIndexCreator<T extends DocTermsIndex> extends EntryCreatorW
|
|||
return term;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef term() throws IOException {
|
||||
return term;
|
||||
|
|
|
@ -319,6 +319,10 @@ public class TestExternalCodecs extends LuceneTestCase {
|
|||
return ramField.termToDocs.get(current).docs.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.store.MockDirectoryWrapper;
|
|||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import static org.junit.Assume.*;
|
||||
|
||||
/**
|
||||
* Tests lazy skipping on the proximity file.
|
||||
|
@ -117,14 +118,13 @@ public class TestLazyProxSkipping extends LuceneTestCase {
|
|||
assertTrue(this.seeksCounter > 0);
|
||||
assertTrue("seeksCounter=" + this.seeksCounter + " numHits=" + numHits, this.seeksCounter <= numHits + 1);
|
||||
}
|
||||
|
||||
|
||||
public void testLazySkipping() throws IOException {
|
||||
assumeTrue(!CodecProvider.getDefaultCodec().equals("SimpleText"));
|
||||
// test whether only the minimum amount of seeks()
|
||||
// are performed
|
||||
if (!CodecProvider.getDefaultCodec().equals("SimpleText")) {
|
||||
performTest(5);
|
||||
performTest(10);
|
||||
}
|
||||
performTest(5);
|
||||
performTest(10);
|
||||
}
|
||||
|
||||
public void testSeek() throws IOException {
|
||||
|
|
|
@ -87,7 +87,6 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
|
|||
ascfield2.setLongValue(val);
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
reader = writer.getReader();
|
||||
searcher=new IndexSearcher(reader);
|
||||
writer.close();
|
||||
|
|
|
@ -191,9 +191,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
|
||||
private static Map<MockDirectoryWrapper,StackTraceElement[]> stores;
|
||||
|
||||
// TODO 4.0: make sure we re-enable SimpleText in the rotation
|
||||
//private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"};
|
||||
private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"};
|
||||
private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"};
|
||||
|
||||
private static void swapCodec(Codec c) {
|
||||
final CodecProvider cp = CodecProvider.getDefault();
|
||||
|
@ -246,8 +244,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)));
|
||||
// baseBlockSize cannot be over 127:
|
||||
swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127)));
|
||||
// TODO 4.0: add this into test rotation
|
||||
//swapCodec(new SimpleTextCodec());
|
||||
swapCodec(new SimpleTextCodec());
|
||||
|
||||
return cp.lookup(codec);
|
||||
}
|
||||
|
|
|
@ -40,7 +40,6 @@ import org.apache.solr.search.*;
|
|||
import org.apache.solr.util.ByteUtils;
|
||||
import org.apache.solr.util.LongPriorityQueue;
|
||||
import org.apache.solr.util.PrimUtils;
|
||||
import org.apache.solr.util.BoundedTreeSet;
|
||||
import org.apache.solr.handler.component.StatsValues;
|
||||
import org.apache.solr.handler.component.FieldFacetStats;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
|
@ -1002,6 +1001,11 @@ class NumberedTermsEnum extends TermsEnum {
|
|||
return tenum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public BytesRef skipTo(BytesRef target) throws IOException {
|
||||
|
||||
// already here
|
||||
|
|
Loading…
Reference in New Issue