From a15dd4b5cb6ddf4e70d98930fc5e5ead8aed96ad Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 28 Sep 2010 16:31:44 +0000 Subject: [PATCH] LUCENE-2674: improve interaction of MTQ & terms cache git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002246 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 8 +++++ lucene/MIGRATE.txt | 5 +++ .../instantiated/InstantiatedTermsEnum.java | 4 +++ .../lucene/index/memory/MemoryIndex.java | 4 +++ .../lucene/index/FilterIndexReader.java | 5 +++ .../apache/lucene/index/MultiTermsEnum.java | 13 +++++-- .../org/apache/lucene/index/TermsEnum.java | 7 ++++ .../index/codecs/PrefixCodedTermsReader.java | 13 +++++++ .../index/codecs/preflex/PreFlexFields.java | 21 ++++++----- .../index/codecs/preflex/SegmentTermEnum.java | 3 +- .../index/codecs/preflex/TermInfosReader.java | 36 +++++++++++++------ .../simpletext/SimpleTextFieldsReader.java | 27 ++++++++++---- .../lucene/search/FilteredTermsEnum.java | 18 ++++------ .../apache/lucene/search/FuzzyTermsEnum.java | 5 +++ .../apache/lucene/search/MultiTermQuery.java | 36 +++++++++---------- .../org/apache/lucene/search/Similarity.java | 16 +++++++-- .../org/apache/lucene/search/TermQuery.java | 17 +++++++-- .../search/cache/DocTermsIndexCreator.java | 5 +++ .../org/apache/lucene/TestExternalCodecs.java | 4 +++ .../lucene/index/TestLazyProxSkipping.java | 10 +++--- .../search/TestNumericRangeQuery64.java | 1 - .../apache/lucene/util/LuceneTestCase.java | 7 ++-- .../apache/solr/request/UnInvertedField.java | 6 +++- 23 files changed, 193 insertions(+), 78 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ca09073da2f..946e67f5fa1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -121,6 +121,9 @@ Changes in backwards compatibility policy priority queue size, you can use FuzzyQuery(Term, float, int, int) to specify those explicitly. +* LUCENE-2674: MultiTermQuery.TermCollector.collect now accepts the + TermsEnum as well. (Robert Muir, Mike McCandless) + Changes in Runtime Behavior * LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit @@ -150,6 +153,11 @@ API Changes commit points when they are not needed anymore (instead of waiting for the next commit). (Shai Erera) +* LUCENE-2674: A new idfExplain method was added to Similarity, that + accepts an incoming docFreq. If you subclass Similarity, make sure + you also override this method on upgrade. (Robert Muir, Mike + McCandless) + New features * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 542759777da..40ddec64662 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -278,3 +278,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing // document is deleted... } +* LUCENE-2674: A new idfExplain method was added to Similarity, that + accepts an incoming docFreq. If you subclass Similarity, make sure + you also override this method on upgrade, otherwise your + customizations won't run for certain MultiTermQuerys. + diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java index 580485c1781..d5301529f37 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java @@ -90,6 +90,10 @@ public class InstantiatedTermsEnum extends TermsEnum { } } + @Override + public void cacheCurrentTerm() { + } + @Override public BytesRef term() { return br; diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 38860cb9130..c1d73d00f24 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -873,6 +873,10 @@ public class MemoryIndex implements Serializable { return br; } + @Override + public void cacheCurrentTerm() { + } + @Override public long ord() { return termUpto; diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java index 799eb8d1b8b..8149fd12567 100644 --- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -129,6 +129,11 @@ public class FilterIndexReader extends IndexReader { return in.seek(text, useCache); } + @Override + public void cacheCurrentTerm() throws IOException { + in.cacheCurrentTerm(); + } + @Override public SeekStatus seek(long ord) throws IOException { return in.seek(ord); diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java index c2a68c5ba0e..02e21b17ffc 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -90,6 +90,13 @@ public final class MultiTermsEnum extends TermsEnum { return current; } + @Override + public void cacheCurrentTerm() throws IOException { + for(int i=0;i getComparator() { return termComp; @@ -213,7 +220,7 @@ public final class MultiTermsEnum extends TermsEnum { throw new UnsupportedOperationException(); } - private final void pullTop() { + private void pullTop() { // extract all subs from the queue that have the same // top term assert numTop == 0; @@ -226,7 +233,7 @@ public final class MultiTermsEnum extends TermsEnum { current = top[0].current; } - private final void pushTop() throws IOException { + private void pushTop() throws IOException { // call next() on each top, and put back into queue for(int i=0;i getComparator() throws IOException; + /** Optional optimization hint: informs the codec that the + * current term is likely to be re-seek'd-to soon. */ + public abstract void cacheCurrentTerm() throws IOException; + /** An empty TermsEnum for quickly returning an empty instance e.g. * in {@link org.apache.lucene.search.MultiTermQuery} *

Please note: This enum should be unmodifiable, @@ -137,6 +141,9 @@ public abstract class TermsEnum { @Override public SeekStatus seek(long ord) { return SeekStatus.END; } + @Override + public void cacheCurrentTerm() {} + @Override public BytesRef term() { throw new IllegalStateException("this method should never be called"); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java index d37a803075b..3a574db7ee7 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java @@ -82,6 +82,11 @@ public class PrefixCodedTermsReader extends FieldsProducer { public FieldAndTerm() { } + public FieldAndTerm(String field, BytesRef term) { + this.field = field; + this.term = new BytesRef(term); + } + public FieldAndTerm(FieldAndTerm other) { field = other.field; term = new BytesRef(other.term); @@ -297,6 +302,14 @@ public class PrefixCodedTermsReader extends FieldsProducer { return termComp; } + @Override + public void cacheCurrentTerm() { + TermState stateCopy = (TermState) state.clone(); + stateCopy.filePointer = in.getFilePointer(); + termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term), + stateCopy); + } + /** Seeks until the first term that's >= the provided * text; returns SeekStatus.FOUND if the exact term * is found, SeekStatus.NOT_FOUND if a different term diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index 676578fd232..57072463ca0 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -316,7 +316,7 @@ public class PreFlexFields extends FieldsProducer { } // Seek "back": - getTermsDict().seekEnum(te, protoTerm.createTerm(term)); + getTermsDict().seekEnum(te, protoTerm.createTerm(term), true); // Test if the term we seek'd to in fact found a // surrogate pair at the same position as the E: @@ -387,7 +387,7 @@ public class PreFlexFields extends FieldsProducer { if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { // TODO: more efficient seek? - getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true); //newSuffixStart = downTo+4; newSuffixStart = downTo; scratchTerm.copy(termEnum.term().bytes()); @@ -443,7 +443,7 @@ public class PreFlexFields extends FieldsProducer { // TODO: more efficient seek? can we simply swap // the enums? - getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); + getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm), true); final Term t2 = termEnum.term(); @@ -619,7 +619,7 @@ public class PreFlexFields extends FieldsProducer { // Seek "forward": // TODO: more efficient seek? - getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); + getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm), true); scratchTerm.bytes[upTo] = scratch[0]; scratchTerm.bytes[upTo+1] = scratch[1]; @@ -668,7 +668,7 @@ public class PreFlexFields extends FieldsProducer { // OK seek "back" // TODO: more efficient seek? - getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term(), true); scratchTerm.copy(seekTermEnum.term().bytes()); @@ -701,7 +701,7 @@ public class PreFlexFields extends FieldsProducer { seekTermEnum = getTermsDict().terms(protoTerm); //System.out.println(" term=" + termEnum.term()); } else { - getTermsDict().seekEnum(termEnum, protoTerm); + getTermsDict().seekEnum(termEnum, protoTerm, true); } skipNext = true; @@ -726,6 +726,11 @@ public class PreFlexFields extends FieldsProducer { } } + @Override + public void cacheCurrentTerm() throws IOException { + getTermsDict().cacheCurrentTerm(termEnum); + } + @Override public SeekStatus seek(long ord) throws IOException { throw new UnsupportedOperationException(); @@ -747,7 +752,7 @@ public class PreFlexFields extends FieldsProducer { assert termEnum != null; - tis.seekEnum(termEnum, t0); + tis.seekEnum(termEnum, t0, useCache); final Term t = termEnum.term(); @@ -783,7 +788,7 @@ public class PreFlexFields extends FieldsProducer { if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { scratchTerm.copy(seekTermEnum.term().bytes()); - getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term(), useCache); newSuffixStart = 1+i; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java index e78bfa1a406..d3bd5f1aef0 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java @@ -52,7 +52,7 @@ public final class SegmentTermEnum implements Cloneable { private TermBuffer prevBuffer = new TermBuffer(); private TermBuffer scanBuffer = new TermBuffer(); // used for scanning - private TermInfo termInfo = new TermInfo(); + TermInfo termInfo = new TermInfo(); private int format; private boolean isIndex = false; @@ -61,7 +61,6 @@ public final class SegmentTermEnum implements Cloneable { int skipInterval; int newSuffixStart; int maxSkipLevels; - private int formatM1SkipInterval; SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) throws CorruptIndexException, IOException { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java index c2b6d041116..b633a6a99fd 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java @@ -54,8 +54,8 @@ public final class TermInfosReader { // Just adds term's ord to TermInfo private final static class TermInfoAndOrd extends TermInfo { - final int termOrd; - public TermInfoAndOrd(TermInfo ti, int termOrd) { + final long termOrd; + public TermInfoAndOrd(TermInfo ti, long termOrd) { super(ti); this.termOrd = termOrd; } @@ -228,14 +228,24 @@ public final class TermInfosReader { return tiOrd; } - return seekEnum(resources.termEnum, term, tiOrd); + return seekEnum(resources.termEnum, term, tiOrd, true); } - TermInfo seekEnum(SegmentTermEnum enumerator, Term term) throws IOException { - return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term))); + public void cacheCurrentTerm(SegmentTermEnum enumerator) { + termsCache.put(new CloneableTerm(enumerator.term()), + new TermInfoAndOrd(enumerator.termInfo, + enumerator.position)); } - TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { + TermInfo seekEnum(SegmentTermEnum enumerator, Term term, boolean useCache) throws IOException { + if (useCache) { + return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)), useCache); + } else { + return seekEnum(enumerator, term, null, useCache); + } + } + + TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, boolean useCache) throws IOException { if (size == 0) { return null; } @@ -252,7 +262,7 @@ public final class TermInfosReader { final TermInfo ti; int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { - ti = enumerator.termInfo(); + ti = enumerator.termInfo; if (numScans > 1) { // we only want to put this TermInfo into the cache if // scanEnum skipped more than one dictionary entry. @@ -260,7 +270,9 @@ public final class TermInfosReader { // wipe out the cache when they iterate over a large numbers // of terms in order if (tiOrd == null) { - termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); + if (useCache) { + termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position)); + } } else { assert sameTermInfo(ti, tiOrd, enumerator); assert (int) enumerator.position == tiOrd.termOrd; @@ -277,7 +289,7 @@ public final class TermInfosReader { // random-access: must seek final int indexPos; if (tiOrd != null) { - indexPos = tiOrd.termOrd / totalIndexInterval; + indexPos = (int) (tiOrd.termOrd / totalIndexInterval); } else { // Must do binary search: indexPos = getIndexOffset(term); @@ -288,9 +300,11 @@ public final class TermInfosReader { final TermInfo ti; if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { - ti = enumerator.termInfo(); + ti = enumerator.termInfo; if (tiOrd == null) { - termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); + if (useCache) { + termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position)); + } } else { assert sameTermInfo(ti, tiOrd, enumerator); assert (int) enumerator.position == tiOrd.termOrd; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index bf3c436d39c..14c72b8a919 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -32,6 +32,8 @@ import org.apache.lucene.util.StringHelper; import java.io.IOException; import java.util.Comparator; +import java.util.Map; +import java.util.HashMap; class SimpleTextFieldsReader extends FieldsProducer { @@ -153,6 +155,10 @@ class SimpleTextFieldsReader extends FieldsProducer { return SeekStatus.END; } + @Override + public void cacheCurrentTerm() { + } + @Override public BytesRef next() throws IOException { assert !ended; @@ -468,16 +474,23 @@ class SimpleTextFieldsReader extends FieldsProducer { return new SimpleTextFieldsEnum(); } + private final Map termsCache = new HashMap(); + @Override - public Terms terms(String field) throws IOException { - SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator(); - String fieldUpto; - while((fieldUpto = fe.next()) != null) { - if (fieldUpto.equals(field)) { - return new SimpleTextTerms(field, fe.in.getFilePointer()); + synchronized public Terms terms(String field) throws IOException { + Terms terms = termsCache.get(field); + if (terms == null) { + SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator(); + String fieldUpto; + while((fieldUpto = fe.next()) != null) { + if (fieldUpto.equals(field)) { + terms = new SimpleTextTerms(field, fe.in.getFilePointer()); + break; + } } + termsCache.put(field, terms); } - return null; + return terms; } @Override diff --git a/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java index e3381ebd9f7..1c652834fc2 100644 --- a/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java @@ -46,7 +46,6 @@ public abstract class FilteredTermsEnum extends TermsEnum { private BytesRef initialSeekTerm = null; private boolean doSeek = true; private BytesRef actualTerm = null; - private boolean useTermsCache = false; private final TermsEnum tenum; @@ -116,16 +115,6 @@ public abstract class FilteredTermsEnum extends TermsEnum { return t; } - /** Expert: enable or disable the terms cache when seeking. */ - protected final void setUseTermsCache(boolean useTermsCache) { - this.useTermsCache = useTermsCache; - } - - /** Expert: enable or disable the terms cache when seeking. */ - protected final boolean getUseTermsCache() { - return useTermsCache; - } - /** * Returns the related attributes, the returned {@link AttributeSource} * is shared with the delegate {@code TermsEnum}. @@ -188,6 +177,11 @@ public abstract class FilteredTermsEnum extends TermsEnum { assert tenum != null; return tenum.docsAndPositions(bits, reuse); } + + @Override + public void cacheCurrentTerm() throws IOException { + tenum.cacheCurrentTerm(); + } @Override public BytesRef next() throws IOException { @@ -200,7 +194,7 @@ public abstract class FilteredTermsEnum extends TermsEnum { final BytesRef t = nextSeekTerm(actualTerm); // Make sure we always seek forward: assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t; - if (t == null || tenum.seek(t, useTermsCache) == SeekStatus.END) { + if (t == null || tenum.seek(t, false) == SeekStatus.END) { // no more terms to seek to or enum exhausted return null; } diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 0c38aa509be..fe99e24ed1b 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -219,6 +219,11 @@ public final class FuzzyTermsEnum extends TermsEnum { return actualEnum.docFreq(); } + @Override + public void cacheCurrentTerm() throws IOException { + actualEnum.cacheCurrentTerm(); + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { return actualEnum.docs(skipDocs, reuse); diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index 89c90907fa8..747779505e1 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -19,7 +19,6 @@ package org.apache.lucene.search; import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; @@ -201,7 +200,8 @@ public abstract class MultiTermQuery extends Query { int count = 0; BytesRef bytes; while ((bytes = termsEnum.next()) != null) { - if (collector.collect(bytes, boostAtt.getBoost())) { + if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) { + termsEnum.cacheCurrentTerm(); count++; } else { break; @@ -215,7 +215,7 @@ public abstract class MultiTermQuery extends Query { private BoostAttribute boostAtt = null; /** return false to stop collecting */ - public abstract boolean collect(BytesRef bytes, float boost) throws IOException; + public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException; /** set the minimum boost as a hint for the term producer */ protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { @@ -231,9 +231,10 @@ public abstract class MultiTermQuery extends Query { final BooleanQuery result = new BooleanQuery(true); final Term placeholderTerm = new Term(query.field); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - public boolean collect(BytesRef bytes, float boost) { + @Override + public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { // add new TQ, we must clone the term, else it may get overwritten! - TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes))); + TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq()); tq.setBoost(query.getBoost() * boost); // set the boost result.add(tq, BooleanClause.Occur.SHOULD); // add to query return true; @@ -291,20 +292,22 @@ public abstract class MultiTermQuery extends Query { } /** Return a suitable Query for a MultiTermQuery term. */ - protected abstract Query getQuery(Term term); + protected abstract Query getQuery(Term term, int docCount); @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { - public boolean collect(BytesRef bytes, float boost) { + @Override + public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { // ignore uncompetetive hits if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) return true; // add new entry in PQ, we must clone the term, else it may get overwritten! st.bytes.copy(bytes); st.boost = boost; + st.docFreq = termsEnum.docFreq(); stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); @@ -320,7 +323,7 @@ public abstract class MultiTermQuery extends Query { final BooleanQuery bq = new BooleanQuery(true); for (final ScoreTerm st : stQueue) { // add new query, we must clone the term, else it may get overwritten! - Query tq = getQuery(placeholderTerm.createTerm(st.bytes)); + Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq); tq.setBoost(query.getBoost() * st.boost); // set the boost bq.add(tq, BooleanClause.Occur.SHOULD); // add to query } @@ -349,6 +352,7 @@ public abstract class MultiTermQuery extends Query { private static class ScoreTerm implements Comparable { public final BytesRef bytes = new BytesRef(); public float boost; + public int docFreq; public int compareTo(ScoreTerm other) { if (this.boost == other.boost) @@ -395,8 +399,8 @@ public abstract class MultiTermQuery extends Query { } @Override - protected Query getQuery(Term term) { - return new TermQuery(term); + protected Query getQuery(Term term, int docFreq) { + return new TermQuery(term, docFreq); } } @@ -433,8 +437,8 @@ public abstract class MultiTermQuery extends Query { } @Override - protected Query getQuery(Term term) { - return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term))); + protected Query getQuery(Term term, int docFreq) { + return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq))); } } @@ -567,18 +571,14 @@ public abstract class MultiTermQuery extends Query { this.termCountLimit = termCountLimit; } - public boolean collect(BytesRef bytes, float boost) throws IOException { + public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException { termCount++; if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } pendingTerms.copyUsingLengthPrefix(bytes); - // Loading the TermInfo from the terms dict here - // should not be costly, because 1) the - // query/filter will load the TermInfo when it - // runs, and 2) the terms dict has a cache: - docVisitCount += reader.docFreq(field, bytes); + docVisitCount += termsEnum.docFreq(); return true; } diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/Similarity.java index 0d414d63e67..cebe18b90a6 100644 --- a/lucene/src/java/org/apache/lucene/search/Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/Similarity.java @@ -741,7 +741,7 @@ public abstract class Similarity implements Serializable { * The default implementation uses: * *

-   * idf(searcher.docFreq(term), searcher.maxDoc());
+   * idf(docFreq, searcher.maxDoc());
    * 
* * Note that {@link Searcher#maxDoc()} is used instead of @@ -752,12 +752,13 @@ public abstract class Similarity implements Serializable { * * @param term the term in question * @param searcher the document collection being searched + * @param docFreq externally computed docFreq for this term * @return an IDFExplain object that includes both an idf score factor and an explanation for the term. * @throws IOException */ - public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException { - final int df = searcher.docFreq(term); + public IDFExplanation idfExplain(final Term term, final Searcher searcher, int docFreq) throws IOException { + final int df = docFreq; final int max = searcher.maxDoc(); final float idf = idf(df, max); return new IDFExplanation() { @@ -772,6 +773,15 @@ public abstract class Similarity implements Serializable { }}; } + /** + * This method forwards to {@link + * idfExplain(Term,Searcher,int)} by passing + * searcher.docFreq(term) as the docFreq. + */ + public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException { + return idfExplain(term, searcher, searcher.docFreq(term)); + } + /** * Computes a score factor for a phrase. * diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index 549317a67e2..4fda9b9fa17 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -30,7 +30,8 @@ import org.apache.lucene.util.ToStringUtils; This may be combined with other terms with a {@link BooleanQuery}. */ public class TermQuery extends Query { - private Term term; + private final Term term; + private final int docFreq; private class TermWeight extends Weight { private final Similarity similarity; @@ -43,7 +44,11 @@ public class TermQuery extends Query { public TermWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); - idfExp = similarity.idfExplain(term, searcher); + if (docFreq != -1) { + idfExp = similarity.idfExplain(term, searcher, docFreq); + } else { + idfExp = similarity.idfExplain(term, searcher); + } idf = idfExp.getIdf(); } @@ -160,7 +165,15 @@ public class TermQuery extends Query { /** Constructs a query for the term t. */ public TermQuery(Term t) { + this(t, -1); + } + + /** Expert: constructs a TermQuery that will use the + * provided docFreq instead of looking up the docFreq + * against the searcher. */ + public TermQuery(Term t, int docFreq) { term = t; + this.docFreq = docFreq; } /** Returns the term of this query. */ diff --git a/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java b/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java index 0d53d52a634..f9e86ea5cfa 100644 --- a/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java +++ b/lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java @@ -284,6 +284,11 @@ public class DocTermsIndexCreator extends EntryCreatorW return term; } + @Override + public void cacheCurrentTerm() throws IOException { + throw new UnsupportedOperationException(); + } + @Override public BytesRef term() throws IOException { return term; diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index c148bcb7e37..6d0e4b1f962 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -319,6 +319,10 @@ public class TestExternalCodecs extends LuceneTestCase { return ramField.termToDocs.get(current).docs.size(); } + @Override + public void cacheCurrentTerm() { + } + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); diff --git a/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java b/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java index 6f1438adcc3..bfaac41eafb 100755 --- a/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java +++ b/lucene/src/test/org/apache/lucene/index/TestLazyProxSkipping.java @@ -33,6 +33,7 @@ import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; +import static org.junit.Assume.*; /** * Tests lazy skipping on the proximity file. @@ -117,14 +118,13 @@ public class TestLazyProxSkipping extends LuceneTestCase { assertTrue(this.seeksCounter > 0); assertTrue("seeksCounter=" + this.seeksCounter + " numHits=" + numHits, this.seeksCounter <= numHits + 1); } - + public void testLazySkipping() throws IOException { + assumeTrue(!CodecProvider.getDefaultCodec().equals("SimpleText")); // test whether only the minimum amount of seeks() // are performed - if (!CodecProvider.getDefaultCodec().equals("SimpleText")) { - performTest(5); - performTest(10); - } + performTest(5); + performTest(10); } public void testSeek() throws IOException { diff --git a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java index 73caa4c825b..cc654f85c7b 100644 --- a/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java +++ b/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java @@ -87,7 +87,6 @@ public class TestNumericRangeQuery64 extends LuceneTestCase { ascfield2.setLongValue(val); writer.addDocument(doc); } - reader = writer.getReader(); searcher=new IndexSearcher(reader); writer.close(); diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java index 2be9f186459..9694dbaa010 100644 --- a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java @@ -191,9 +191,7 @@ public abstract class LuceneTestCase extends Assert { private static Map stores; - // TODO 4.0: make sure we re-enable SimpleText in the rotation - //private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"}; - private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"}; + private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock", "SimpleText"}; private static void swapCodec(Codec c) { final CodecProvider cp = CodecProvider.getDefault(); @@ -246,8 +244,7 @@ public abstract class LuceneTestCase extends Assert { swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000))); // baseBlockSize cannot be over 127: swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127))); - // TODO 4.0: add this into test rotation - //swapCodec(new SimpleTextCodec()); + swapCodec(new SimpleTextCodec()); return cp.lookup(codec); } diff --git a/solr/src/java/org/apache/solr/request/UnInvertedField.java b/solr/src/java/org/apache/solr/request/UnInvertedField.java index 0bbf2a8b935..c92423e9c29 100755 --- a/solr/src/java/org/apache/solr/request/UnInvertedField.java +++ b/solr/src/java/org/apache/solr/request/UnInvertedField.java @@ -40,7 +40,6 @@ import org.apache.solr.search.*; import org.apache.solr.util.ByteUtils; import org.apache.solr.util.LongPriorityQueue; import org.apache.solr.util.PrimUtils; -import org.apache.solr.util.BoundedTreeSet; import org.apache.solr.handler.component.StatsValues; import org.apache.solr.handler.component.FieldFacetStats; import org.apache.lucene.util.OpenBitSet; @@ -1002,6 +1001,11 @@ class NumberedTermsEnum extends TermsEnum { return tenum.docFreq(); } + @Override + public void cacheCurrentTerm() { + throw new UnsupportedOperationException(); + } + public BytesRef skipTo(BytesRef target) throws IOException { // already here