LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-08-25 12:50:06 +00:00
parent ca3f86d815
commit c1c2ce7716
7 changed files with 280 additions and 36 deletions

View File

@ -412,6 +412,10 @@ API Changes
AttributeFactory to all Tokenizer implementations. AttributeFactory to all Tokenizer implementations.
(Michael Busch) (Michael Busch)
38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
been deprecated. New versions that return an IDFExplanation have been
added. (Yasoja Seneviratne, Mike McCandless, Mark Miller)
Bug fixes Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
@ -516,6 +520,10 @@ Bug fixes
new LocalizedTestCase as base class for localization junit tests. new LocalizedTestCase as base class for localization junit tests.
(Robert Muir, Uwe Schindler via Michael Busch) (Robert Muir, Uwe Schindler via Michael Busch)
26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats
in their Weight#explain methods - these stats should be corpus wide.
(Yasoja Seneviratne, Mike McCandless, Mark Miller)
New features New features
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior 1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
@ -730,11 +738,6 @@ New features
passed to not double RAM usage in the FieldCache. (Chris passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless) Hostetter, Mark Miller, Mike McCandless)
37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
FieldCacheSanityChecker to detect when a new cache entry has
caused additional insanity, printing the details at the time that
it happens. (Chris Hostetter, Mike McCandless)
Optimizations Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
/** Expert: Describes the score computation for document and query. */ /** Expert: Describes the score computation for document and query. */
@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable {
return buffer.toString(); return buffer.toString();
} }
/**
 * Small utility class used to pass both an idf factor as well as an
 * explanation for how that factor was computed.
 *
 * This class will likely be held on a {@link Weight}, so be aware
 * before storing any large or un-serializable fields.
 */
public static abstract class IDFExplanation implements Serializable {
/**
 * @return the idf factor
 */
public abstract float getIdf();
/**
 * Renders the human-readable explanation of how the idf factor was
 * computed. This should be calculated lazily if possible.
 *
 * @return the explanation for the idf factor.
 */
public abstract String explain();
}
} }

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions; import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a particular sequence of terms. /** A Query that matches documents containing a particular sequence of terms.
@ -112,12 +113,14 @@ public class PhraseQuery extends Query {
private float idf; private float idf;
private float queryNorm; private float queryNorm;
private float queryWeight; private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(Searcher searcher) public PhraseWeight(Searcher searcher)
throws IOException { throws IOException {
this.similarity = getSimilarity(searcher); this.similarity = getSimilarity(searcher);
idf = similarity.idf(terms, searcher); idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
} }
public String toString() { return "weight(" + PhraseQuery.this + ")"; } public String toString() { return "weight(" + PhraseQuery.this + ")"; }
@ -167,24 +170,20 @@ public class PhraseQuery extends Query {
StringBuffer docFreqs = new StringBuffer(); StringBuffer docFreqs = new StringBuffer();
StringBuffer query = new StringBuffer(); StringBuffer query = new StringBuffer();
query.append('\"'); query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) { for (int i = 0; i < terms.size(); i++) {
if (i != 0) { if (i != 0) {
docFreqs.append(" ");
query.append(" "); query.append(" ");
} }
Term term = (Term)terms.get(i); Term term = (Term)terms.get(i);
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
query.append(term.text()); query.append(term.text());
} }
query.append('\"'); query.append('\"');
Explanation idfExpl = Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -17,13 +17,16 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.SmallFloat; import org.apache.lucene.util.SmallFloat;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.Collection; import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.Iterator; import java.util.Iterator;
/** Expert: Scoring API. /** Expert: Scoring API.
@ -287,8 +290,6 @@ import java.util.Iterator;
* @see Searcher#setSimilarity(Similarity) * @see Searcher#setSimilarity(Similarity)
*/ */
public abstract class Similarity implements Serializable { public abstract class Similarity implements Serializable {
/** The Similarity implementation used by default. */
private static Similarity defaultImpl = new DefaultSimilarity();
public static final int NO_DOC_ID_PROVIDED = -1; public static final int NO_DOC_ID_PROVIDED = -1;
@ -478,11 +479,63 @@ public abstract class Similarity implements Serializable {
* @param term the term in question * @param term the term in question
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @return a score factor for the term * @return a score factor for the term
* @deprecated see {@link #idfExplain(Term, Searcher)}
*/ */
public float idf(Term term, Searcher searcher) throws IOException { public float idf(Term term, Searcher searcher) throws IOException {
return idf(searcher.docFreq(term), searcher.maxDoc()); return idf(searcher.docFreq(term), searcher.maxDoc());
} }
/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor.
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre>
 * idf(searcher.docFreq(term), searcher.maxDoc());
 * </pre>
 * 
 * Note that {@link Searcher#maxDoc()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
 * proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
 * inaccurate, so is the other, and in the same direction.
 * 
 * @param term the term in question
 * @param searcher the document collection being searched
 * @return an {@link IDFExplanation} object that includes both an idf score factor 
           and an explanation for the term.
 * @throws IOException
 */
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
// Back-compat path: if a subclass still overrides the deprecated
// idf(Term, Searcher), honor its value — but then no meaningful
// per-term explanation can be produced, hence "Inexplicable".
if(supportedMethods.overridesTermIDF) {
final float idf = idf(term, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: compute df/maxDoc eagerly (cheap ints) so the anonymous
// class captures plain finals; the explanation string is built lazily.
final int df = searcher.docFreq(term);
final int max = searcher.maxDoc();
final float idf = idf(df, max);
// NOTE: @Override on interface/abstract-method implementations is left
// commented out, presumably for pre-Java-6 compilation — TODO confirm.
return new IDFExplanation() {
//@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
//@Override
public float getIdf() {
return idf;
}};
}
/** Computes a score factor for a phrase. /** Computes a score factor for a phrase.
* *
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable {
* *
* @param terms the terms in the phrase * @param terms the terms in the phrase
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @return a score factor for the phrase * @return
* @deprecated see {@link #idfExplain(Collection, Searcher)}
*/ */
public float idf(Collection terms, Searcher searcher) throws IOException { public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f; float idf = 0.0f;
@ -501,6 +555,60 @@ public abstract class Similarity implements Serializable {
return idf; return idf;
} }
/**
 * Computes a score factor for a phrase.
 * 
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 * 
 * @param terms the terms in the phrase
 * @param searcher the document collection being searched
 * @return an {@link IDFExplanation} object that includes both an idf 
 *         score factor for the phrase and an explanation 
 *         for each term.
 * @throws IOException
 */
public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
// Back-compat path: if a subclass still overrides the deprecated
// idf(Collection, Searcher), honor its value; no per-term explanation
// can be produced in that case.
if(supportedMethods.overridesCollectionIDF) {
final float idf = idf(terms, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: sum idf over the phrase terms while accumulating a
// " term=docFreq" fragment per term for the explanation string.
// Uses searcher-level (corpus-wide) docFreq/maxDoc, not per-reader stats.
final int max = searcher.maxDoc();
float idf = 0.0f;
final StringBuffer exp = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
final int df = searcher.docFreq(term);
idf += idf(df, max);
exp.append(" ");
exp.append(term.text());
exp.append("=");
exp.append(df);
}
// Capture the summed idf in an (effectively) final local for the
// anonymous class below.
final float fIdf = idf;
return new IDFExplanation() {
//@Override
public float getIdf() {
return fIdf;
}
//@Override
public String explain() {
return exp.toString();
}
};
}
/** Computes a score factor based on a term's document frequency (the number /** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the * of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are * {@link #tf(int)} factor for each term in the query and these products are
@ -578,4 +686,51 @@ public abstract class Similarity implements Serializable {
return scorePayload(fieldName, payload, offset, length); return scorePayload(fieldName, payload, offset, length);
} }
/** @deprecated Remove this when old API is removed! */
// Per-instance snapshot of which deprecated idf(...) overloads this
// concrete Similarity subclass overrides; consulted by idfExplain to
// decide between the back-compat path and the default path.
private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
/** @deprecated Remove this when old API is removed! */
// Immutable record of reflection results for one Similarity subclass.
private static final class MethodSupport implements Serializable {
final boolean overridesCollectionIDF, overridesTermIDF;
MethodSupport(Class clazz) {
overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
}
// A method counts as overridden when its declaring class is anything
// other than Similarity itself (i.e. some subclass redefined it).
private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
try {
return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
} catch (NoSuchMethodException e) {
// should not happen: both idf overloads are declared on Similarity
throw new RuntimeException(e);
}
}
/** @deprecated Remove this when old API is removed! */
private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
/** @deprecated Remove this when old API is removed! */
private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
}
/** @deprecated Remove this when old API is removed! */
// Cache of reflection lookups, keyed by subclass identity, so the
// getMethod() cost is paid once per Similarity implementation class.
private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
/** @deprecated Remove this when old API is removed! */
// Synchronized get-or-create on the shared cache above.
private static MethodSupport getSupportedMethods(Class clazz) {
MethodSupport supportedMethods;
synchronized(knownMethodSupport) {
supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
if (supportedMethods == null) {
knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
}
}
return supportedMethods;
}
/** The Similarity implementation used by default.
 * TODO: move back to top when old API is removed!
 * (Kept below the static reflection machinery so class initialization
 * order stays valid while the deprecated support code exists.)
 **/
private static Similarity defaultImpl = new DefaultSimilarity();
} }

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a term. /** A Query that matches documents containing a term.
@ -37,11 +38,13 @@ public class TermQuery extends Query {
private float idf; private float idf;
private float queryNorm; private float queryNorm;
private float queryWeight; private float queryWeight;
private IDFExplanation idfExp;
public TermWeight(Searcher searcher) public TermWeight(Searcher searcher)
throws IOException { throws IOException {
this.similarity = getSimilarity(searcher); this.similarity = getSimilarity(searcher);
idf = similarity.idf(term, searcher); // compute idf idfExp = similarity.idfExplain(term, searcher);
idf = idfExp.getIdf();
} }
public String toString() { return "weight(" + TermQuery.this + ")"; } public String toString() { return "weight(" + TermQuery.this + ")"; }
@ -75,8 +78,7 @@ public class TermQuery extends Query {
ComplexExplanation result = new ComplexExplanation(); ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) + Explanation expl = new Explanation(idf, idfExp.explain());
", maxDocs=" + reader.maxDoc() + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -18,12 +18,11 @@ package org.apache.lucene.search.spans;
*/ */
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator;
import java.util.Set; import java.util.Set;
/** /**
@ -38,6 +37,7 @@ public class SpanWeight extends Weight {
protected Set terms; protected Set terms;
protected SpanQuery query; protected SpanQuery query;
private IDFExplanation idfExp;
public SpanWeight(SpanQuery query, Searcher searcher) public SpanWeight(SpanQuery query, Searcher searcher)
throws IOException { throws IOException {
@ -45,8 +45,8 @@ public class SpanWeight extends Weight {
this.query = query; this.query = query;
terms=new HashSet(); terms=new HashSet();
query.extractTerms(terms); query.extractTerms(terms);
idfExp = similarity.idfExplain(terms, searcher);
idf = this.query.getSimilarity(searcher).idf(terms, searcher); idf = idfExp.getIdf();
} }
public Query getQuery() { return query; } public Query getQuery() { return query; }
@ -75,21 +75,8 @@ public class SpanWeight extends Weight {
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField(); String field = ((SpanQuery)getQuery()).getField();
StringBuffer docFreqs = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
if (i.hasNext()) {
docFreqs.append(" ");
}
}
Explanation idfExpl = Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -17,6 +17,19 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
/** /**
* TestExplanations subclass focusing on basic query types * TestExplanations subclass focusing on basic query types
*/ */
@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations {
} }
/**
 * LUCENE-1847 regression test: explanations produced through a
 * MultiSearcher must use corpus-wide statistics (docFreq/maxDoc summed
 * across all sub-searchers), not the stats of the single sub-index that
 * happened to hold the hit. Three identical docs are split 2/1 across
 * two indices, so every explanation should report the combined counts
 * (docFreq=3, maxDocs=3), regardless of which sub-index scored the doc.
 */
public void testTermQueryMultiSearcherExplain() throws Exception {
// creating two directories for indices
Directory indexStoreA = new MockRAMDirectory();
Directory indexStoreB = new MockRAMDirectory();
Document lDoc = new Document();
lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc2 = new Document();
lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc3 = new Document();
lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
// index A gets two docs, index B gets one — total corpus is 3 docs
writerA.addDocument(lDoc);
writerA.addDocument(lDoc2);
writerA.optimize();
writerA.close();
writerB.addDocument(lDoc3);
writerB.close();
QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
Query query = parser.parse("handle:1");
Searcher[] searchers = new Searcher[2];
searchers[0] = new IndexSearcher(indexStoreB);
searchers[1] = new IndexSearcher(indexStoreA);
Searcher mSearcher = new MultiSearcher(searchers);
ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
// TermQuery explanation: must show combined docFreq=3/maxDocs=3
Explanation explain = mSearcher.explain(query, hits[0].doc);
String exp = explain.toString(0);
assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
assertTrue(exp, exp.indexOf("docFreq=3") > -1);
// PhraseQuery explanation: per-term "term=docFreq" entries, corpus-wide
query = parser.parse("handle:\"1 2\"");
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
// SpanQuery explanation: same corpus-wide per-term entries
query = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term("handle", "1")),
new SpanTermQuery(new Term("handle", "2")) }, 0, true);
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
mSearcher.close();
}
} }