LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-08-25 12:50:06 +00:00
parent ca3f86d815
commit c1c2ce7716
7 changed files with 280 additions and 36 deletions

View File

@ -411,6 +411,10 @@ API Changes
37. LUCENE-1826: Add constructors that take AttributeSource and
AttributeFactory to all Tokenizer implementations.
(Michael Busch)
38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
been deprecated. New versions that return an IDFExplanation have been
added. (Yasoja Seneviratne, Mike McCandless, Mark Miller)
Bug fixes
@ -516,6 +520,10 @@ Bug fixes
new LocalizedTestCase as base class for localization junit tests.
(Robert Muir, Uwe Schindler via Michael Busch)
26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats
in their Weight#explain methods - these stats should be corpus wide.
(Yasoja Seneviratne, Mike McCandless, Mark Miller)
New features
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
@ -729,11 +737,6 @@ New features
ValueSource, but takes care when composite (multi-segment) are
passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless)
37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
FieldCacheSanityChecker to detect when a new cache entry has
caused additional insanity, printing the details at the time that
it happens. (Chris Hostetter, Mike McCandless)
Optimizations

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.Serializable;
import java.util.ArrayList;
/** Expert: Describes the score computation for document and query. */
@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable {
return buffer.toString();
}
/**
* Small utility class used to pass both an idf factor as well as an
* explanation for that factor.
*
* This class will likely be held on a {@link Weight}, so be aware
* before storing any large or un-serializable fields.
*/
public static abstract class IDFExplanation implements Serializable {
/**
* @return the idf factor computed for the term (or terms) this
* explanation describes
*/
public abstract float getIdf();
/**
* Renders a human-readable description of how the idf factor was
* computed. This should be calculated lazily if possible, since it
* is typically only needed when explain() is called on a query.
*
* @return the explanation for the idf factor.
*/
public abstract String explain();
}
}

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a particular sequence of terms.
@ -112,12 +113,14 @@ public class PhraseQuery extends Query {
private float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(Searcher searcher)
throws IOException {
this.similarity = getSimilarity(searcher);
// Compute idf via the Searcher-level idfExplain so the statistics are
// corpus wide (LUCENE-1847); keep the explanation for Weight#explain.
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
}
public String toString() { return "weight(" + PhraseQuery.this + ")"; }
@ -167,24 +170,20 @@ public class PhraseQuery extends Query {
StringBuffer docFreqs = new StringBuffer();
StringBuffer query = new StringBuffer();
query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) {
if (i != 0) {
docFreqs.append(" ");
query.append(" ");
}
Term term = (Term)terms.get(i);
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
query.append(term.text());
}
query.append('\"');
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -17,13 +17,16 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.SmallFloat;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.Iterator;
/** Expert: Scoring API.
@ -287,8 +290,6 @@ import java.util.Iterator;
* @see Searcher#setSimilarity(Similarity)
*/
public abstract class Similarity implements Serializable {
/** The Similarity implementation used by default. */
private static Similarity defaultImpl = new DefaultSimilarity();
public static final int NO_DOC_ID_PROVIDED = -1;
@ -478,10 +479,62 @@ public abstract class Similarity implements Serializable {
* @param term the term in question
* @param searcher the document collection being searched
* @return a score factor for the term
* @deprecated see {@link #idfExplain(Term, Searcher)}
*/
public float idf(Term term, Searcher searcher) throws IOException {
// Uses searcher-level stats (docFreq/maxDoc) so the factor reflects
// the whole corpus rather than a single IndexReader's segment.
return idf(searcher.docFreq(term), searcher.maxDoc());
}
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(searcher.docFreq(term), searcher.maxDoc());
* </pre>
*
* Note that {@link Searcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
* proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
* inaccurate, so is the other, and in the same direction.
*
* @param term the term in question
* @param searcher the document collection being searched
* @return an IDFExplanation object that includes both an idf score factor
* and an explanation for the term.
* @throws IOException
*/
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
// Back-compat: if a subclass still overrides the deprecated
// idf(Term, Searcher), honor its value. We cannot know how that value
// was computed, hence the "Inexplicable" explanation text.
if(supportedMethods.overridesTermIDF) {
final float idf = idf(term, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: capture docFreq/maxDoc eagerly so the anonymous class
// can render the explanation later without touching the searcher again.
final int df = searcher.docFreq(term);
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new IDFExplanation() {
//@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
//@Override
public float getIdf() {
return idf;
}};
}
/** Computes a score factor for a phrase.
*
@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable {
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return a score factor for the phrase
* @return an idf score factor for the phrase
* @deprecated see {@link #idfExplain(Collection, Searcher)}
*/
public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f;
@ -500,6 +554,60 @@ public abstract class Similarity implements Serializable {
}
return idf;
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return an IDFExplanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
// Back-compat: a subclass that still overrides the deprecated
// idf(Collection, Searcher) supplies the value, but we cannot
// explain how it was computed.
if (supportedMethods.overridesCollectionIDF) {
final float legacyIdf = idf(terms, searcher);
return new IDFExplanation() {
//@Override
public String explain() {
return "Inexplicable";
}
//@Override
public float getIdf() {
return legacyIdf;
}
};
}
// Default path: sum per-term idf and record " term=docFreq" details.
final int numDocs = searcher.maxDoc();
float sum = 0.0f;
final StringBuffer details = new StringBuffer();
for (Iterator it = terms.iterator(); it.hasNext();) {
Term term = (Term) it.next();
final int docFreq = searcher.docFreq(term);
sum += idf(docFreq, numDocs);
details.append(" ").append(term.text()).append("=").append(docFreq);
}
final float totalIdf = sum;
return new IDFExplanation() {
//@Override
public String explain() {
return details.toString();
}
//@Override
public float getIdf() {
return totalIdf;
}
};
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
@ -577,5 +685,52 @@ public abstract class Similarity implements Serializable {
//TODO: When removing the deprecated scorePayload above, set this to return 1
return scorePayload(fieldName, payload, offset, length);
}
/** @deprecated Remove this when old API is removed! */
private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
/**
* Records, via reflection, which deprecated idf methods a concrete
* Similarity subclass overrides, so idfExplain can delegate to them
* for backwards compatibility.
*
* @deprecated Remove this when old API is removed!
*/
private static final class MethodSupport implements Serializable {
final boolean overridesCollectionIDF, overridesTermIDF;
MethodSupport(Class clazz) {
overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
}
// True when clazz (or an ancestor other than Similarity itself)
// declares the given method, i.e. the deprecated API is overridden.
private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
try {
return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
} catch (NoSuchMethodException e) {
// should not happen
throw new RuntimeException(e);
}
}
/** @deprecated Remove this when old API is removed! */
private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
/** @deprecated Remove this when old API is removed! */
private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
}
/** @deprecated Remove this when old API is removed! */
private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
/**
* Returns the (cached) MethodSupport for the given Similarity subclass,
* computing it once per class since the reflection lookup is costly.
* Access to the cache is synchronized for thread safety.
*
* @deprecated Remove this when old API is removed!
*/
private static MethodSupport getSupportedMethods(Class clazz) {
MethodSupport supportedMethods;
synchronized(knownMethodSupport) {
supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
if (supportedMethods == null) {
// First time we see this class: inspect it and cache the result.
knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
}
}
return supportedMethods;
}
/** The Similarity implementation used by default.
* TODO: move back to top when old API is removed!
**/
private static Similarity defaultImpl = new DefaultSimilarity();
}

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a term.
@ -37,11 +38,13 @@ public class TermQuery extends Query {
private float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public TermWeight(Searcher searcher)
throws IOException {
this.similarity = getSimilarity(searcher);
// Compute idf via the Searcher-level idfExplain so the statistics are
// corpus wide (LUCENE-1847); keep the explanation for Weight#explain.
idfExp = similarity.idfExplain(term, searcher);
idf = idfExp.getIdf();
}
public String toString() { return "weight(" + TermQuery.this + ")"; }
@ -75,8 +78,7 @@ public class TermQuery extends Query {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
", maxDocs=" + reader.maxDoc() + ")");
Explanation expl = new Explanation(idf, idfExp.explain());
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -18,12 +18,11 @@ package org.apache.lucene.search.spans;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
@ -38,6 +37,7 @@ public class SpanWeight extends Weight {
protected Set terms;
protected SpanQuery query;
private IDFExplanation idfExp;
public SpanWeight(SpanQuery query, Searcher searcher)
throws IOException {
@ -45,8 +45,8 @@ public class SpanWeight extends Weight {
this.query = query;
terms=new HashSet();
query.extractTerms(terms);
idf = this.query.getSimilarity(searcher).idf(terms, searcher);
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
}
public Query getQuery() { return query; }
@ -75,21 +75,8 @@ public class SpanWeight extends Weight {
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField();
StringBuffer docFreqs = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
if (i.hasNext()) {
docFreqs.append(" ");
}
}
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -17,6 +17,19 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
/**
* TestExplanations subclass focusing on basic query types
*/
@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations {
}
public void testTermQueryMultiSearcherExplain() throws Exception {
// creating two directories for indices
Directory indexStoreA = new MockRAMDirectory();
Directory indexStoreB = new MockRAMDirectory();
// Three identical docs, split 2/1 across the two indices, so that
// per-reader stats would differ from the corpus-wide stats below.
Document lDoc = new Document();
lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc2 = new Document();
lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc3 = new Document();
lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
writerA.addDocument(lDoc);
writerA.addDocument(lDoc2);
writerA.optimize();
writerA.close();
writerB.addDocument(lDoc3);
writerB.close();
QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
Query query = parser.parse("handle:1");
// Search both indices through a MultiSearcher.
Searcher[] searchers = new Searcher[2];
searchers[0] = new IndexSearcher(indexStoreB);
searchers[1] = new IndexSearcher(indexStoreA);
Searcher mSearcher = new MultiSearcher(searchers);
ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
// TermQuery: explanation must report corpus-wide docFreq/maxDocs (3),
// not the per-index values (1 or 2) — this is the LUCENE-1847 fix.
Explanation explain = mSearcher.explain(query, hits[0].doc);
String exp = explain.toString(0);
assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
assertTrue(exp, exp.indexOf("docFreq=3") > -1);
// PhraseQuery: each term's docFreq in the explanation should be
// corpus wide as well ("1=3", "2=3").
query = parser.parse("handle:\"1 2\"");
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
// SpanQuery: same corpus-wide expectation for span term stats.
query = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term("handle", "1")),
new SpanTermQuery(new Term("handle", "2")) }, 0, true);
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
mSearcher.close();
}
}