LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-08-25 12:50:06 +00:00
parent ca3f86d815
commit c1c2ce7716
7 changed files with 280 additions and 36 deletions

View File

@ -411,6 +411,10 @@ API Changes
37. LUCENE-1826: Add constructors that take AttributeSource and
AttributeFactory to all Tokenizer implementations.
(Michael Busch)
38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
been deprecated. New versions that return an IDFExplanation have been
added. (Yasoja Seneviratne, Mike McCandless, Mark Miller)
Bug fixes
@ -516,6 +520,10 @@ Bug fixes
new LocalizedTestCase as base class for localization junit tests.
(Robert Muir, Uwe Schindler via Michael Busch)
26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats
in their Weight#explain methods - these stats should be corpus wide.
(Yasoja Seneviratne, Mike McCandless, Mark Miller)
New features
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
@ -729,11 +737,6 @@ New features
ValueSource, but takes care when composite (multi-segment) are
passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless)
37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
FieldCacheSanityChecker to detect when a new cache entry has
caused additional insanity, printing the details at the time that
it happens. (Chris Hostetter, Mike McCandless)
Optimizations

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.Serializable;
import java.util.ArrayList;
/** Expert: Describes the score computation for document and query. */
@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable {
return buffer.toString();
}
/**
* Small utility class used to pass both an idf factor as well as an
* explanation for that factor.
*
* This class will likely be held on a {@link Weight}, so be aware
* before storing any large or un-serializable fields.
*/
public static abstract class IDFExplanation implements Serializable {
/**
* @return the idf factor computed for the term (or terms) this
* explanation describes
*/
public abstract float getIdf();
/**
* Renders a human-readable description of how the idf factor was
* computed. This should be calculated lazily if possible, since it
* is typically only needed when explain() is called on a query.
*
* @return the explanation for the idf factor.
*/
public abstract String explain();
}
}

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a particular sequence of terms.
@ -112,12 +113,14 @@ public class PhraseQuery extends Query {
private float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(Searcher searcher)
throws IOException {
this.similarity = getSimilarity(searcher);
// Compute idf via the Searcher-level idfExplain so the statistics are
// corpus wide (LUCENE-1847); keep the explanation for Weight#explain.
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
}
public String toString() { return "weight(" + PhraseQuery.this + ")"; }
@ -167,24 +170,20 @@ public class PhraseQuery extends Query {
StringBuffer docFreqs = new StringBuffer();
StringBuffer query = new StringBuffer();
query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) {
if (i != 0) {
docFreqs.append(" ");
query.append(" ");
}
Term term = (Term)terms.get(i);
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
query.append(term.text());
}
query.append('\"');
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -17,13 +17,16 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.SmallFloat;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.Iterator;
/** Expert: Scoring API.
@ -287,8 +290,6 @@ import java.util.Iterator;
* @see Searcher#setSimilarity(Similarity)
*/
public abstract class Similarity implements Serializable {
/** The Similarity implementation used by default. */
private static Similarity defaultImpl = new DefaultSimilarity();
public static final int NO_DOC_ID_PROVIDED = -1;
@ -478,10 +479,62 @@ public abstract class Similarity implements Serializable {
* @param term the term in question
* @param searcher the document collection being searched
* @return a score factor for the term
* @deprecated see {@link #idfExplain(Term, Searcher)}
*/
public float idf(Term term, Searcher searcher) throws IOException {
// Uses searcher-level stats (docFreq/maxDoc) so the factor reflects
// the whole corpus rather than a single IndexReader's segment.
return idf(searcher.docFreq(term), searcher.maxDoc());
}
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(searcher.docFreq(term), searcher.maxDoc());
* </pre>
*
* Note that {@link Searcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
* proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
* inaccurate, so is the other, and in the same direction.
*
* @param term the term in question
* @param searcher the document collection being searched
* @return an IDFExplanation object that includes both an idf score factor
* and an explanation for the term.
* @throws IOException
*/
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
// Back-compat: if a subclass still overrides the deprecated
// idf(Term, Searcher), honor its value. We cannot know how that value
// was computed, hence the "Inexplicable" explanation text.
if(supportedMethods.overridesTermIDF) {
final float idf = idf(term, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: capture docFreq/maxDoc eagerly so the anonymous class
// can render the explanation later without touching the searcher again.
final int df = searcher.docFreq(term);
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new IDFExplanation() {
//@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
//@Override
public float getIdf() {
return idf;
}};
}
/** Computes a score factor for a phrase.
*
@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable {
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return a score factor for the phrase
* @return an idf score factor for the phrase
* @deprecated see {@link #idfExplain(Collection, Searcher)}
*/
public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f;
@ -500,6 +554,60 @@ public abstract class Similarity implements Serializable {
}
return idf;
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return an IDFExplanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
// Back-compat: a subclass that still overrides the deprecated
// idf(Collection, Searcher) supplies the value, but we cannot
// explain how it was computed.
if (supportedMethods.overridesCollectionIDF) {
final float legacyIdf = idf(terms, searcher);
return new IDFExplanation() {
//@Override
public String explain() {
return "Inexplicable";
}
//@Override
public float getIdf() {
return legacyIdf;
}
};
}
// Default path: sum per-term idf and record " term=docFreq" details.
final int numDocs = searcher.maxDoc();
float sum = 0.0f;
final StringBuffer details = new StringBuffer();
for (Iterator it = terms.iterator(); it.hasNext();) {
Term term = (Term) it.next();
final int docFreq = searcher.docFreq(term);
sum += idf(docFreq, numDocs);
details.append(" ").append(term.text()).append("=").append(docFreq);
}
final float totalIdf = sum;
return new IDFExplanation() {
//@Override
public String explain() {
return details.toString();
}
//@Override
public float getIdf() {
return totalIdf;
}
};
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
@ -577,5 +685,52 @@ public abstract class Similarity implements Serializable {
//TODO: When removing the deprecated scorePayload above, set this to return 1
return scorePayload(fieldName, payload, offset, length);
}
/** @deprecated Remove this when old API is removed! */
private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
/**
* Records, via reflection, which deprecated idf methods a concrete
* Similarity subclass overrides, so idfExplain can delegate to them
* for backwards compatibility.
*
* @deprecated Remove this when old API is removed!
*/
private static final class MethodSupport implements Serializable {
final boolean overridesCollectionIDF, overridesTermIDF;
MethodSupport(Class clazz) {
overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
}
// True when clazz (or an ancestor other than Similarity itself)
// declares the given method, i.e. the deprecated API is overridden.
private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
try {
return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
} catch (NoSuchMethodException e) {
// should not happen
throw new RuntimeException(e);
}
}
/** @deprecated Remove this when old API is removed! */
private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
/** @deprecated Remove this when old API is removed! */
private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
}
/** @deprecated Remove this when old API is removed! */
private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
/**
* Returns the (cached) MethodSupport for the given Similarity subclass,
* computing it once per class since the reflection lookup is costly.
* Access to the cache is synchronized for thread safety.
*
* @deprecated Remove this when old API is removed!
*/
private static MethodSupport getSupportedMethods(Class clazz) {
MethodSupport supportedMethods;
synchronized(knownMethodSupport) {
supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
if (supportedMethods == null) {
// First time we see this class: inspect it and cache the result.
knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
}
}
return supportedMethods;
}
/** The Similarity implementation used by default.
* TODO: move back to top when old API is removed!
**/
private static Similarity defaultImpl = new DefaultSimilarity();
}

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a term.
@ -37,11 +38,13 @@ public class TermQuery extends Query {
private float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public TermWeight(Searcher searcher)
throws IOException {
this.similarity = getSimilarity(searcher);
// Compute idf via the Searcher-level idfExplain so the statistics are
// corpus wide (LUCENE-1847); keep the explanation for Weight#explain.
idfExp = similarity.idfExplain(term, searcher);
idf = idfExp.getIdf();
}
public String toString() { return "weight(" + TermQuery.this + ")"; }
@ -75,8 +78,7 @@ public class TermQuery extends Query {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
", maxDocs=" + reader.maxDoc() + ")");
Explanation expl = new Explanation(idf, idfExp.explain());
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -18,12 +18,11 @@ package org.apache.lucene.search.spans;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
@ -38,6 +37,7 @@ public class SpanWeight extends Weight {
protected Set terms;
protected SpanQuery query;
private IDFExplanation idfExp;
public SpanWeight(SpanQuery query, Searcher searcher)
throws IOException {
@ -45,8 +45,8 @@ public class SpanWeight extends Weight {
this.query = query;
terms=new HashSet();
query.extractTerms(terms);
idf = this.query.getSimilarity(searcher).idf(terms, searcher);
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
}
public Query getQuery() { return query; }
@ -75,21 +75,8 @@ public class SpanWeight extends Weight {
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField();
StringBuffer docFreqs = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
if (i.hasNext()) {
docFreqs.append(" ");
}
}
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight
Explanation queryExpl = new Explanation();

View File

@ -17,6 +17,19 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
/**
* TestExplanations subclass focusing on basic query types
*/
@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations {
}
public void testTermQueryMultiSearcherExplain() throws Exception {
// creating two directories for indices
Directory indexStoreA = new MockRAMDirectory();
Directory indexStoreB = new MockRAMDirectory();
// Three identical docs, split 2/1 across the two indices, so that
// per-reader stats would differ from the corpus-wide stats below.
Document lDoc = new Document();
lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc2 = new Document();
lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc3 = new Document();
lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
writerA.addDocument(lDoc);
writerA.addDocument(lDoc2);
writerA.optimize();
writerA.close();
writerB.addDocument(lDoc3);
writerB.close();
QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
Query query = parser.parse("handle:1");
// Search both indices through a MultiSearcher.
Searcher[] searchers = new Searcher[2];
searchers[0] = new IndexSearcher(indexStoreB);
searchers[1] = new IndexSearcher(indexStoreA);
Searcher mSearcher = new MultiSearcher(searchers);
ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
// TermQuery: explanation must report corpus-wide docFreq/maxDocs (3),
// not the per-index values (1 or 2) — this is the LUCENE-1847 fix.
Explanation explain = mSearcher.explain(query, hits[0].doc);
String exp = explain.toString(0);
assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
assertTrue(exp, exp.indexOf("docFreq=3") > -1);
// PhraseQuery: each term's docFreq in the explanation should be
// corpus wide as well ("1=3", "2=3").
query = parser.parse("handle:\"1 2\"");
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
// SpanQuery: same corpus-wide expectation for span term stats.
query = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term("handle", "1")),
new SpanTermQuery(new Term("handle", "2")) }, 0, true);
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
mSearcher.close();
}
}