mirror of https://github.com/apache/lucene.git
LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ca3f86d815
commit
c1c2ce7716
13
CHANGES.txt
13
CHANGES.txt
|
@ -411,6 +411,10 @@ API Changes
|
|||
37. LUCENE-1826: Add constructors that take AttributeSource and
|
||||
AttributeFactory to all Tokenizer implementations.
|
||||
(Michael Busch)
|
||||
|
||||
38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
|
||||
been deprecated. New versions that return an IDFExplanation have been
|
||||
added. (Yasoja Seneviratne, Mike McCandless, Mark Miller)
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
@ -516,6 +520,10 @@ Bug fixes
|
|||
new LocalizedTestCase as base class for localization junit tests.
|
||||
(Robert Muir, Uwe Schindler via Michael Busch)
|
||||
|
||||
26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats
|
||||
in their Weight#explain methods - these stats should be corpus wide.
|
||||
(Yasoja Seneviratne, Mike McCandless, Mark Miller)
|
||||
|
||||
New features
|
||||
|
||||
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
|
||||
|
@ -729,11 +737,6 @@ New features
|
|||
ValueSource, but takes care when composite (multi-segment) are
|
||||
passed to not double RAM usage in the FieldCache. (Chris
|
||||
Hostetter, Mark Miller, Mike McCandless)
|
||||
|
||||
37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
|
||||
FieldCacheSanityChecker to detect when a new cache entry has
|
||||
caused additional insanity, printing the details at the time that
|
||||
it happens. (Chris Hostetter, Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** Expert: Describes the score computation for document and query. */
|
||||
|
@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable {
|
|||
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Small Util class used to pass both an idf factor as well as an
|
||||
* explanation for that factor.
|
||||
*
|
||||
* This class will likely be held on a {@link Weight}, so be aware
|
||||
* before storing any large or un-serializable fields.
|
||||
*
|
||||
*/
|
||||
public static abstract class IDFExplanation implements Serializable {
|
||||
/**
|
||||
* @return the idf factor
|
||||
*/
|
||||
public abstract float getIdf();
|
||||
/**
|
||||
* This should be calculated lazily if possible.
|
||||
*
|
||||
* @return the explanation for the idf factor.
|
||||
*/
|
||||
public abstract String explain();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.ArrayList;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
/** A Query that matches documents containing a particular sequence of terms.
|
||||
|
@ -112,12 +113,14 @@ public class PhraseQuery extends Query {
|
|||
private float idf;
|
||||
private float queryNorm;
|
||||
private float queryWeight;
|
||||
private IDFExplanation idfExp;
|
||||
|
||||
public PhraseWeight(Searcher searcher)
|
||||
throws IOException {
|
||||
this.similarity = getSimilarity(searcher);
|
||||
|
||||
idf = similarity.idf(terms, searcher);
|
||||
idfExp = similarity.idfExplain(terms, searcher);
|
||||
idf = idfExp.getIdf();
|
||||
}
|
||||
|
||||
public String toString() { return "weight(" + PhraseQuery.this + ")"; }
|
||||
|
@ -167,24 +170,20 @@ public class PhraseQuery extends Query {
|
|||
StringBuffer docFreqs = new StringBuffer();
|
||||
StringBuffer query = new StringBuffer();
|
||||
query.append('\"');
|
||||
docFreqs.append(idfExp.explain());
|
||||
for (int i = 0; i < terms.size(); i++) {
|
||||
if (i != 0) {
|
||||
docFreqs.append(" ");
|
||||
query.append(" ");
|
||||
}
|
||||
|
||||
Term term = (Term)terms.get(i);
|
||||
|
||||
docFreqs.append(term.text());
|
||||
docFreqs.append("=");
|
||||
docFreqs.append(reader.docFreq(term));
|
||||
|
||||
query.append(term.text());
|
||||
}
|
||||
query.append('\"');
|
||||
|
||||
Explanation idfExpl =
|
||||
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
|
||||
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
|
||||
|
||||
// explain query weight
|
||||
Explanation queryExpl = new Explanation();
|
||||
|
|
|
@ -17,13 +17,16 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
/** Expert: Scoring API.
|
||||
|
@ -287,8 +290,6 @@ import java.util.Iterator;
|
|||
* @see Searcher#setSimilarity(Similarity)
|
||||
*/
|
||||
public abstract class Similarity implements Serializable {
|
||||
/** The Similarity implementation used by default. */
|
||||
private static Similarity defaultImpl = new DefaultSimilarity();
|
||||
|
||||
public static final int NO_DOC_ID_PROVIDED = -1;
|
||||
|
||||
|
@ -478,10 +479,62 @@ public abstract class Similarity implements Serializable {
|
|||
* @param term the term in question
|
||||
* @param searcher the document collection being searched
|
||||
* @return a score factor for the term
|
||||
* @deprecated see {@link #idfExplain(Term, Searcher)}
|
||||
*/
|
||||
public float idf(Term term, Searcher searcher) throws IOException {
|
||||
return idf(searcher.docFreq(term), searcher.maxDoc());
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a score factor for a simple term and returns an explanation
|
||||
* for that score factor.
|
||||
*
|
||||
* <p>
|
||||
* The default implementation uses:
|
||||
*
|
||||
* <pre>
|
||||
* idf(searcher.docFreq(term), searcher.maxDoc());
|
||||
* </pre>
|
||||
*
|
||||
* Note that {@link Searcher#maxDoc()} is used instead of
|
||||
* {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
|
||||
* proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
|
||||
* inaccurate, so is the other, and in the same direction.
|
||||
*
|
||||
* @param term the term in question
|
||||
* @param searcher the document collection being searched
|
||||
* @return an IDFExplain object that includes both an idf score factor
|
||||
and an explanation for the term.
|
||||
* @throws IOException
|
||||
*/
|
||||
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
|
||||
if(supportedMethods.overridesTermIDF) {
|
||||
final float idf = idf(term, searcher);
|
||||
return new IDFExplanation() {
|
||||
//@Override
|
||||
public float getIdf() {
|
||||
return idf;
|
||||
}
|
||||
//@Override
|
||||
public String explain() {
|
||||
return "Inexplicable";
|
||||
}
|
||||
};
|
||||
}
|
||||
final int df = searcher.docFreq(term);
|
||||
final int max = searcher.maxDoc();
|
||||
final float idf = idf(df, max);
|
||||
return new IDFExplanation() {
|
||||
//@Override
|
||||
public String explain() {
|
||||
return "idf(docFreq=" + df +
|
||||
", maxDocs=" + max + ")";
|
||||
}
|
||||
//@Override
|
||||
public float getIdf() {
|
||||
return idf;
|
||||
}};
|
||||
}
|
||||
|
||||
/** Computes a score factor for a phrase.
|
||||
*
|
||||
|
@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable {
|
|||
*
|
||||
* @param terms the terms in the phrase
|
||||
* @param searcher the document collection being searched
|
||||
* @return a score factor for the phrase
|
||||
* @return
|
||||
* @deprecated see {@link #idfExplain(Collection, Searcher)}
|
||||
*/
|
||||
public float idf(Collection terms, Searcher searcher) throws IOException {
|
||||
float idf = 0.0f;
|
||||
|
@ -500,6 +554,60 @@ public abstract class Similarity implements Serializable {
|
|||
}
|
||||
return idf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a score factor for a phrase.
|
||||
*
|
||||
* <p>
|
||||
* The default implementation sums the idf factor for
|
||||
* each term in the phrase.
|
||||
*
|
||||
* @param terms the terms in the phrase
|
||||
* @param searcher the document collection being searched
|
||||
* @return an IDFExplain object that includes both an idf
|
||||
* score factor for the phrase and an explanation
|
||||
* for each term.
|
||||
* @throws IOException
|
||||
*/
|
||||
public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
|
||||
if(supportedMethods.overridesCollectionIDF) {
|
||||
final float idf = idf(terms, searcher);
|
||||
return new IDFExplanation() {
|
||||
//@Override
|
||||
public float getIdf() {
|
||||
return idf;
|
||||
}
|
||||
//@Override
|
||||
public String explain() {
|
||||
return "Inexplicable";
|
||||
}
|
||||
};
|
||||
}
|
||||
final int max = searcher.maxDoc();
|
||||
float idf = 0.0f;
|
||||
final StringBuffer exp = new StringBuffer();
|
||||
Iterator i = terms.iterator();
|
||||
while (i.hasNext()) {
|
||||
Term term = (Term)i.next();
|
||||
final int df = searcher.docFreq(term);
|
||||
idf += idf(df, max);
|
||||
exp.append(" ");
|
||||
exp.append(term.text());
|
||||
exp.append("=");
|
||||
exp.append(df);
|
||||
}
|
||||
final float fIdf = idf;
|
||||
return new IDFExplanation() {
|
||||
//@Override
|
||||
public float getIdf() {
|
||||
return fIdf;
|
||||
}
|
||||
//@Override
|
||||
public String explain() {
|
||||
return exp.toString();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** Computes a score factor based on a term's document frequency (the number
|
||||
* of documents which contain the term). This value is multiplied by the
|
||||
|
@ -577,5 +685,52 @@ public abstract class Similarity implements Serializable {
|
|||
//TODO: When removing the deprecated scorePayload above, set this to return 1
|
||||
return scorePayload(fieldName, payload, offset, length);
|
||||
}
|
||||
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
|
||||
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private static final class MethodSupport implements Serializable {
|
||||
final boolean overridesCollectionIDF, overridesTermIDF;
|
||||
|
||||
MethodSupport(Class clazz) {
|
||||
overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
|
||||
overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
|
||||
}
|
||||
|
||||
private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
|
||||
try {
|
||||
return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
|
||||
} catch (NoSuchMethodException e) {
|
||||
// should not happen
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
|
||||
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
|
||||
}
|
||||
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
|
||||
|
||||
/** @deprecated Remove this when old API is removed! */
|
||||
private static MethodSupport getSupportedMethods(Class clazz) {
|
||||
MethodSupport supportedMethods;
|
||||
synchronized(knownMethodSupport) {
|
||||
supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
|
||||
if (supportedMethods == null) {
|
||||
knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
|
||||
}
|
||||
}
|
||||
return supportedMethods;
|
||||
}
|
||||
|
||||
/** The Similarity implementation used by default.
|
||||
* TODO: move back to top when old API is removed!
|
||||
**/
|
||||
private static Similarity defaultImpl = new DefaultSimilarity();
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
/** A Query that matches documents containing a term.
|
||||
|
@ -37,11 +38,13 @@ public class TermQuery extends Query {
|
|||
private float idf;
|
||||
private float queryNorm;
|
||||
private float queryWeight;
|
||||
private IDFExplanation idfExp;
|
||||
|
||||
public TermWeight(Searcher searcher)
|
||||
throws IOException {
|
||||
this.similarity = getSimilarity(searcher);
|
||||
idf = similarity.idf(term, searcher); // compute idf
|
||||
idfExp = similarity.idfExplain(term, searcher);
|
||||
idf = idfExp.getIdf();
|
||||
}
|
||||
|
||||
public String toString() { return "weight(" + TermQuery.this + ")"; }
|
||||
|
@ -75,8 +78,7 @@ public class TermQuery extends Query {
|
|||
ComplexExplanation result = new ComplexExplanation();
|
||||
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
|
||||
|
||||
Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
|
||||
", maxDocs=" + reader.maxDoc() + ")");
|
||||
Explanation expl = new Explanation(idf, idfExp.explain());
|
||||
|
||||
// explain query weight
|
||||
Explanation queryExpl = new Explanation();
|
||||
|
|
|
@ -18,12 +18,11 @@ package org.apache.lucene.search.spans;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
|
@ -38,6 +37,7 @@ public class SpanWeight extends Weight {
|
|||
|
||||
protected Set terms;
|
||||
protected SpanQuery query;
|
||||
private IDFExplanation idfExp;
|
||||
|
||||
public SpanWeight(SpanQuery query, Searcher searcher)
|
||||
throws IOException {
|
||||
|
@ -45,8 +45,8 @@ public class SpanWeight extends Weight {
|
|||
this.query = query;
|
||||
terms=new HashSet();
|
||||
query.extractTerms(terms);
|
||||
|
||||
idf = this.query.getSimilarity(searcher).idf(terms, searcher);
|
||||
idfExp = similarity.idfExplain(terms, searcher);
|
||||
idf = idfExp.getIdf();
|
||||
}
|
||||
|
||||
public Query getQuery() { return query; }
|
||||
|
@ -75,21 +75,8 @@ public class SpanWeight extends Weight {
|
|||
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
|
||||
String field = ((SpanQuery)getQuery()).getField();
|
||||
|
||||
StringBuffer docFreqs = new StringBuffer();
|
||||
Iterator i = terms.iterator();
|
||||
while (i.hasNext()) {
|
||||
Term term = (Term)i.next();
|
||||
docFreqs.append(term.text());
|
||||
docFreqs.append("=");
|
||||
docFreqs.append(reader.docFreq(term));
|
||||
|
||||
if (i.hasNext()) {
|
||||
docFreqs.append(" ");
|
||||
}
|
||||
}
|
||||
|
||||
Explanation idfExpl =
|
||||
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
|
||||
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
|
||||
|
||||
// explain query weight
|
||||
Explanation queryExpl = new Explanation();
|
||||
|
|
|
@ -17,6 +17,19 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
|
||||
|
||||
/**
|
||||
* TestExplanations subclass focusing on basic query types
|
||||
*/
|
||||
|
@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations {
|
|||
}
|
||||
|
||||
|
||||
public void testTermQueryMultiSearcherExplain() throws Exception {
|
||||
// creating two directories for indices
|
||||
Directory indexStoreA = new MockRAMDirectory();
|
||||
Directory indexStoreB = new MockRAMDirectory();
|
||||
|
||||
Document lDoc = new Document();
|
||||
lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
|
||||
Document lDoc2 = new Document();
|
||||
lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
|
||||
Document lDoc3 = new Document();
|
||||
lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
|
||||
|
||||
IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
|
||||
writerA.addDocument(lDoc);
|
||||
writerA.addDocument(lDoc2);
|
||||
writerA.optimize();
|
||||
writerA.close();
|
||||
|
||||
writerB.addDocument(lDoc3);
|
||||
writerB.close();
|
||||
|
||||
QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
|
||||
Query query = parser.parse("handle:1");
|
||||
|
||||
Searcher[] searchers = new Searcher[2];
|
||||
searchers[0] = new IndexSearcher(indexStoreB);
|
||||
searchers[1] = new IndexSearcher(indexStoreA);
|
||||
Searcher mSearcher = new MultiSearcher(searchers);
|
||||
ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
|
||||
|
||||
assertEquals(3, hits.length);
|
||||
|
||||
Explanation explain = mSearcher.explain(query, hits[0].doc);
|
||||
String exp = explain.toString(0);
|
||||
assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
|
||||
assertTrue(exp, exp.indexOf("docFreq=3") > -1);
|
||||
|
||||
query = parser.parse("handle:\"1 2\"");
|
||||
hits = mSearcher.search(query, null, 1000).scoreDocs;
|
||||
|
||||
assertEquals(3, hits.length);
|
||||
|
||||
explain = mSearcher.explain(query, hits[0].doc);
|
||||
exp = explain.toString(0);
|
||||
assertTrue(exp, exp.indexOf("1=3") > -1);
|
||||
assertTrue(exp, exp.indexOf("2=3") > -1);
|
||||
|
||||
query = new SpanNearQuery(new SpanQuery[] {
|
||||
new SpanTermQuery(new Term("handle", "1")),
|
||||
new SpanTermQuery(new Term("handle", "2")) }, 0, true);
|
||||
hits = mSearcher.search(query, null, 1000).scoreDocs;
|
||||
|
||||
assertEquals(3, hits.length);
|
||||
|
||||
explain = mSearcher.explain(query, hits[0].doc);
|
||||
exp = explain.toString(0);
|
||||
assertTrue(exp, exp.indexOf("1=3") > -1);
|
||||
assertTrue(exp, exp.indexOf("2=3") > -1);
|
||||
mSearcher.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue