From c1c2ce77160c4bf85f1bea76a42299ef48002f0a Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Tue, 25 Aug 2009 12:50:06 +0000 Subject: [PATCH] LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 13 +- .../org/apache/lucene/search/Explanation.java | 22 +++ .../org/apache/lucene/search/PhraseQuery.java | 13 +- .../org/apache/lucene/search/Similarity.java | 161 +++++++++++++++++- .../org/apache/lucene/search/TermQuery.java | 8 +- .../lucene/search/spans/SpanWeight.java | 23 +-- .../lucene/search/TestSimpleExplanations.java | 76 +++++++++ 7 files changed, 280 insertions(+), 36 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index e9db97beddf..86b2c1ec476 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -411,6 +411,10 @@ API Changes 37. LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations. (Michael Busch) + +38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have + been deprecated. New versions that return an IDFExplanation have been + added. (Yasoja Seneviratne, Mike McCandless, Mark Miller) Bug fixes @@ -516,6 +520,10 @@ Bug fixes new LocalizedTestCase as base class for localization junit tests. (Robert Muir, Uwe Schindler via Michael Busch) +26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats + in their Weight#explain methods - these stats should be corpus wide. + (Yasoja Seneviratne, Mike McCandless, Mark Miller) + New features 1. LUCENE-1411: Added expert API to open an IndexWriter on a prior @@ -729,11 +737,6 @@ New features ValueSource, but takes care when composite (multi-segment) are passed to not double RAM usage in the FieldCache. (Chris Hostetter, Mark Miller, Mike McCandless) - -37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses - FieldCacheSanityChecker to detect when a new cache entry has - caused additional insanity, printing the details at the time that - it happens. (Chris Hostetter, Mike McCandless) Optimizations diff --git a/src/java/org/apache/lucene/search/Explanation.java b/src/java/org/apache/lucene/search/Explanation.java index 4e0b2b7e5f2..dc7fb38c4ad 100644 --- a/src/java/org/apache/lucene/search/Explanation.java +++ b/src/java/org/apache/lucene/search/Explanation.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. */ +import java.io.Serializable; import java.util.ArrayList; /** Expert: Describes the score computation for document and query. */ @@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable { return buffer.toString(); } + + /** + * Small Util class used to pass both an idf factor as well as an + * explanation for that factor. + * + * This class will likely be held on a {@link Weight}, so be aware + * before storing any large or un-serializable fields. + * + */ + public static abstract class IDFExplanation implements Serializable { + /** + * @return the idf factor + */ + public abstract float getIdf(); + /** + * This should be calculated lazily if possible. + * + * @return the explanation for the idf factor. + */ + public abstract String explain(); + } } diff --git a/src/java/org/apache/lucene/search/PhraseQuery.java b/src/java/org/apache/lucene/search/PhraseQuery.java index 1f1962f5768..305e9d23f47 100644 --- a/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/src/java/org/apache/lucene/search/PhraseQuery.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositions; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing a particular sequence of terms. @@ -112,12 +113,14 @@ public class PhraseQuery extends Query { private float idf; private float queryNorm; private float queryWeight; + private IDFExplanation idfExp; public PhraseWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); - idf = similarity.idf(terms, searcher); + idfExp = similarity.idfExplain(terms, searcher); + idf = idfExp.getIdf(); } public String toString() { return "weight(" + PhraseQuery.this + ")"; } @@ -167,24 +170,20 @@ public class PhraseQuery extends Query { StringBuffer docFreqs = new StringBuffer(); StringBuffer query = new StringBuffer(); query.append('\"'); + docFreqs.append(idfExp.explain()); for (int i = 0; i < terms.size(); i++) { if (i != 0) { - docFreqs.append(" "); query.append(" "); } Term term = (Term)terms.get(i); - docFreqs.append(term.text()); - docFreqs.append("="); - docFreqs.append(reader.docFreq(term)); - query.append(term.text()); } query.append('\"'); Explanation idfExpl = - new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); + new Explanation(idf, "idf(" + field + ":" + docFreqs + ")"); // explain query weight Explanation queryExpl = new Explanation(); diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java index e8987acb081..6958657586a 100644 --- a/src/java/org/apache/lucene/search/Similarity.java +++ b/src/java/org/apache/lucene/search/Similarity.java @@ -17,13 +17,16 @@ package org.apache.lucene.search; * limitations under the License. */ + import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.SmallFloat; import java.io.IOException; import java.io.Serializable; import java.util.Collection; +import java.util.IdentityHashMap; import java.util.Iterator; /** Expert: Scoring API. @@ -287,8 +290,6 @@ import java.util.Iterator; * @see Searcher#setSimilarity(Similarity) */ public abstract class Similarity implements Serializable { - /** The Similarity implementation used by default. */ - private static Similarity defaultImpl = new DefaultSimilarity(); public static final int NO_DOC_ID_PROVIDED = -1; @@ -478,10 +479,62 @@ public abstract class Similarity implements Serializable { * @param term the term in question * @param searcher the document collection being searched * @return a score factor for the term + * @deprecated see {@link #idfExplain(Term, Searcher)} */ public float idf(Term term, Searcher searcher) throws IOException { return idf(searcher.docFreq(term), searcher.maxDoc()); } + + /** + * Computes a score factor for a simple term and returns an explanation + * for that score factor. + * + *

+ * The default implementation uses: + * + *

+   * idf(searcher.docFreq(term), searcher.maxDoc());
+   * 
+ * + * Note that {@link Searcher#maxDoc()} is used instead of + * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is + * proportional to {@link Searcher#docFreq(Term)} , i.e., when one is + * inaccurate, so is the other, and in the same direction. + * + * @param term the term in question + * @param searcher the document collection being searched + * @return an IDFExplain object that includes both an idf score factor + and an explanation for the term. + * @throws IOException + */ + public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException { + if(supportedMethods.overridesTermIDF) { + final float idf = idf(term, searcher); + return new IDFExplanation() { + //@Override + public float getIdf() { + return idf; + } + //@Override + public String explain() { + return "Inexplicable"; + } + }; + } + final int df = searcher.docFreq(term); + final int max = searcher.maxDoc(); + final float idf = idf(df, max); + return new IDFExplanation() { + //@Override + public String explain() { + return "idf(docFreq=" + df + + ", maxDocs=" + max + ")"; + } + //@Override + public float getIdf() { + return idf; + }}; + } /** Computes a score factor for a phrase. * @@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable { * * @param terms the terms in the phrase * @param searcher the document collection being searched - * @return a score factor for the phrase + * @return + * @deprecated see {@link #idfExplain(Collection, Searcher)} */ public float idf(Collection terms, Searcher searcher) throws IOException { float idf = 0.0f; @@ -500,6 +554,60 @@ public abstract class Similarity implements Serializable { } return idf; } + + /** + * Computes a score factor for a phrase. + * + *

+ * The default implementation sums the idf factor for + * each term in the phrase. + * + * @param terms the terms in the phrase + * @param searcher the document collection being searched + * @return an IDFExplain object that includes both an idf + * score factor for the phrase and an explanation + * for each term. + * @throws IOException + */ + public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException { + if(supportedMethods.overridesCollectionIDF) { + final float idf = idf(terms, searcher); + return new IDFExplanation() { + //@Override + public float getIdf() { + return idf; + } + //@Override + public String explain() { + return "Inexplicable"; + } + }; + } + final int max = searcher.maxDoc(); + float idf = 0.0f; + final StringBuffer exp = new StringBuffer(); + Iterator i = terms.iterator(); + while (i.hasNext()) { + Term term = (Term)i.next(); + final int df = searcher.docFreq(term); + idf += idf(df, max); + exp.append(" "); + exp.append(term.text()); + exp.append("="); + exp.append(df); + } + final float fIdf = idf; + return new IDFExplanation() { + //@Override + public float getIdf() { + return fIdf; + } + //@Override + public String explain() { + return exp.toString(); + } + }; + } /** Computes a score factor based on a term's document frequency (the number * of documents which contain the term). This value is multiplied by the @@ -577,5 +685,52 @@ public abstract class Similarity implements Serializable { //TODO: When removing the deprecated scorePayload above, set this to return 1 return scorePayload(fieldName, payload, offset, length); } + + /** @deprecated Remove this when old API is removed! */ + private final MethodSupport supportedMethods = getSupportedMethods(this.getClass()); + + /** @deprecated Remove this when old API is removed! */ + private static final class MethodSupport implements Serializable { + final boolean overridesCollectionIDF, overridesTermIDF; + + MethodSupport(Class clazz) { + overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS); + overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS); + } + + private static boolean isMethodOverridden(Class clazz, String name, Class[] params) { + try { + return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class; + } catch (NoSuchMethodException e) { + // should not happen + throw new RuntimeException(e); + } + } + /** @deprecated Remove this when old API is removed! */ + private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class}; + + /** @deprecated Remove this when old API is removed! */ + private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class}; + } + + /** @deprecated Remove this when old API is removed! */ + private static final IdentityHashMap/*,MethodSupport>*/ knownMethodSupport = new IdentityHashMap(); + + /** @deprecated Remove this when old API is removed! */ + private static MethodSupport getSupportedMethods(Class clazz) { + MethodSupport supportedMethods; + synchronized(knownMethodSupport) { + supportedMethods = (MethodSupport) knownMethodSupport.get(clazz); + if (supportedMethods == null) { + knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz)); + } + } + return supportedMethods; + } + + /** The Similarity implementation used by default. + * TODO: move back to top when old API is removed! + **/ + private static Similarity defaultImpl = new DefaultSimilarity(); } diff --git a/src/java/org/apache/lucene/search/TermQuery.java b/src/java/org/apache/lucene/search/TermQuery.java index 958c9b1f17c..bbcdc768213 100644 --- a/src/java/org/apache/lucene/search/TermQuery.java +++ b/src/java/org/apache/lucene/search/TermQuery.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing a term. @@ -37,11 +38,13 @@ public class TermQuery extends Query { private float idf; private float queryNorm; private float queryWeight; + private IDFExplanation idfExp; public TermWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); - idf = similarity.idf(term, searcher); // compute idf + idfExp = similarity.idfExplain(term, searcher); + idf = idfExp.getIdf(); } public String toString() { return "weight(" + TermQuery.this + ")"; } @@ -75,8 +78,7 @@ public class TermQuery extends Query { ComplexExplanation result = new ComplexExplanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) + - ", maxDocs=" + reader.maxDoc() + ")"); + Explanation expl = new Explanation(idf, idfExp.explain()); // explain query weight Explanation queryExpl = new Explanation(); diff --git a/src/java/org/apache/lucene/search/spans/SpanWeight.java b/src/java/org/apache/lucene/search/spans/SpanWeight.java index fb57aa6a9e4..2d903451825 100644 --- a/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,12 +18,11 @@ package org.apache.lucene.search.spans; */ import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; import org.apache.lucene.search.*; +import org.apache.lucene.search.Explanation.IDFExplanation; import java.io.IOException; import java.util.HashSet; -import java.util.Iterator; import java.util.Set; /** @@ -38,6 +37,7 @@ public class SpanWeight extends Weight { protected Set terms; protected SpanQuery query; + private IDFExplanation idfExp; public SpanWeight(SpanQuery query, Searcher searcher) throws IOException { @@ -45,8 +45,8 @@ public class SpanWeight extends Weight { this.query = query; terms=new HashSet(); query.extractTerms(terms); - - idf = this.query.getSimilarity(searcher).idf(terms, searcher); + idfExp = similarity.idfExplain(terms, searcher); + idf = idfExp.getIdf(); } public Query getQuery() { return query; } @@ -75,21 +75,8 @@ public class SpanWeight extends Weight { result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); String field = ((SpanQuery)getQuery()).getField(); - StringBuffer docFreqs = new StringBuffer(); - Iterator i = terms.iterator(); - while (i.hasNext()) { - Term term = (Term)i.next(); - docFreqs.append(term.text()); - docFreqs.append("="); - docFreqs.append(reader.docFreq(term)); - - if (i.hasNext()) { - docFreqs.append(" "); - } - } - Explanation idfExpl = - new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); + new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")"); // explain query weight Explanation queryExpl = new Explanation(); diff --git a/src/test/org/apache/lucene/search/TestSimpleExplanations.java b/src/test/org/apache/lucene/search/TestSimpleExplanations.java index 77d003961d8..999b2ceb892 100644 --- a/src/test/org/apache/lucene/search/TestSimpleExplanations.java +++ b/src/test/org/apache/lucene/search/TestSimpleExplanations.java @@ -17,6 +17,19 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MockRAMDirectory; + + /** * TestExplanations subclass focusing on basic query types */ @@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations { } + public void testTermQueryMultiSearcherExplain() throws Exception { + // creating two directories for indices + Directory indexStoreA = new MockRAMDirectory(); + Directory indexStoreB = new MockRAMDirectory(); + + Document lDoc = new Document(); + lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED)); + Document lDoc2 = new Document(); + lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED)); + Document lDoc3 = new Document(); + lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED)); + + IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + + writerA.addDocument(lDoc); + writerA.addDocument(lDoc2); + writerA.optimize(); + writerA.close(); + + writerB.addDocument(lDoc3); + writerB.close(); + + QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer()); + Query query = parser.parse("handle:1"); + + Searcher[] searchers = new Searcher[2]; + searchers[0] = new IndexSearcher(indexStoreB); + searchers[1] = new IndexSearcher(indexStoreA); + Searcher mSearcher = new MultiSearcher(searchers); + ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs; + + assertEquals(3, hits.length); + + Explanation explain = mSearcher.explain(query, hits[0].doc); + String exp = explain.toString(0); + assertTrue(exp, exp.indexOf("maxDocs=3") > -1); + assertTrue(exp, exp.indexOf("docFreq=3") > -1); + + query = parser.parse("handle:\"1 2\""); + hits = mSearcher.search(query, null, 1000).scoreDocs; + + assertEquals(3, hits.length); + + explain = mSearcher.explain(query, hits[0].doc); + exp = explain.toString(0); + assertTrue(exp, exp.indexOf("1=3") > -1); + assertTrue(exp, exp.indexOf("2=3") > -1); + + query = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term("handle", "1")), + new SpanTermQuery(new Term("handle", "2")) }, 0, true); + hits = mSearcher.search(query, null, 1000).scoreDocs; + + assertEquals(3, hits.length); + + explain = mSearcher.explain(query, hits[0].doc); + exp = explain.toString(0); + assertTrue(exp, exp.indexOf("1=3") > -1); + assertTrue(exp, exp.indexOf("2=3") > -1); + mSearcher.close(); + } + }