LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@807595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-08-25 12:50:06 +00:00
parent ca3f86d815
commit c1c2ce7716
7 changed files with 280 additions and 36 deletions

View File

@ -412,6 +412,10 @@ API Changes
AttributeFactory to all Tokenizer implementations. AttributeFactory to all Tokenizer implementations.
(Michael Busch) (Michael Busch)
38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
been deprecated. New versions that return an IDFExplanation have been
added. (Yasoja Seneviratne, Mike McCandless, Mark Miller)
Bug fixes Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
@ -516,6 +520,10 @@ Bug fixes
new LocalizedTestCase as base class for localization junit tests. new LocalizedTestCase as base class for localization junit tests.
(Robert Muir, Uwe Schindler via Michael Busch) (Robert Muir, Uwe Schindler via Michael Busch)
26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats
in their Weight#explain methods - these stats should be corpus wide.
(Yasoja Seneviratne, Mike McCandless, Mark Miller)
New features New features
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior 1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
@ -730,11 +738,6 @@ New features
passed to not double RAM usage in the FieldCache. (Chris passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless) Hostetter, Mark Miller, Mike McCandless)
37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
FieldCacheSanityChecker to detect when a new cache entry has
caused additional insanity, printing the details at the time that
it happens. (Chris Hostetter, Mike McCandless)
Optimizations Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
/** Expert: Describes the score computation for document and query. */ /** Expert: Describes the score computation for document and query. */
@ -124,4 +125,25 @@ public class Explanation implements java.io.Serializable {
return buffer.toString(); return buffer.toString();
} }
/**
 * Small utility class used to pass both an idf factor as well as an
 * explanation for how that factor was computed.
 *
 * This class will likely be held on a {@link Weight}, so be aware
 * before storing any large or un-serializable fields.
 */
public static abstract class IDFExplanation implements Serializable {
/**
 * @return the idf factor
 */
public abstract float getIdf();
/**
 * Renders the human-readable explanation of how the idf factor was
 * computed. This should be calculated lazily if possible.
 *
 * @return the explanation for the idf factor.
 */
public abstract String explain();
}
} }

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions; import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a particular sequence of terms. /** A Query that matches documents containing a particular sequence of terms.
@ -112,12 +113,14 @@ public class PhraseQuery extends Query {
private float idf; private float idf;
private float queryNorm; private float queryNorm;
private float queryWeight; private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(Searcher searcher) public PhraseWeight(Searcher searcher)
throws IOException { throws IOException {
this.similarity = getSimilarity(searcher); this.similarity = getSimilarity(searcher);
idf = similarity.idf(terms, searcher); idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
} }
public String toString() { return "weight(" + PhraseQuery.this + ")"; } public String toString() { return "weight(" + PhraseQuery.this + ")"; }
@ -167,24 +170,20 @@ public class PhraseQuery extends Query {
StringBuffer docFreqs = new StringBuffer(); StringBuffer docFreqs = new StringBuffer();
StringBuffer query = new StringBuffer(); StringBuffer query = new StringBuffer();
query.append('\"'); query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) { for (int i = 0; i < terms.size(); i++) {
if (i != 0) { if (i != 0) {
docFreqs.append(" ");
query.append(" "); query.append(" ");
} }
Term term = (Term)terms.get(i); Term term = (Term)terms.get(i);
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
query.append(term.text()); query.append(term.text());
} }
query.append('\"'); query.append('\"');
Explanation idfExpl = Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -17,13 +17,16 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.SmallFloat; import org.apache.lucene.util.SmallFloat;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.Collection; import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.Iterator; import java.util.Iterator;
/** Expert: Scoring API. /** Expert: Scoring API.
@ -287,8 +290,6 @@ import java.util.Iterator;
* @see Searcher#setSimilarity(Similarity) * @see Searcher#setSimilarity(Similarity)
*/ */
public abstract class Similarity implements Serializable { public abstract class Similarity implements Serializable {
/** The Similarity implementation used by default. */
private static Similarity defaultImpl = new DefaultSimilarity();
public static final int NO_DOC_ID_PROVIDED = -1; public static final int NO_DOC_ID_PROVIDED = -1;
@ -478,11 +479,63 @@ public abstract class Similarity implements Serializable {
* @param term the term in question * @param term the term in question
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @return a score factor for the term * @return a score factor for the term
* @deprecated see {@link #idfExplain(Term, Searcher)}
*/ */
public float idf(Term term, Searcher searcher) throws IOException { public float idf(Term term, Searcher searcher) throws IOException {
return idf(searcher.docFreq(term), searcher.maxDoc()); return idf(searcher.docFreq(term), searcher.maxDoc());
} }
/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor.
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre>
 * idf(searcher.docFreq(term), searcher.maxDoc());
 * </pre>
 * 
 * Note that {@link Searcher#maxDoc()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
 * proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
 * inaccurate, so is the other, and in the same direction.
 * 
 * @param term the term in question
 * @param searcher the document collection being searched
 * @return an {@link IDFExplanation} object that includes both an idf score factor 
           and an explanation for the term.
 * @throws IOException
 */
public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
// Back-compat path: if a subclass still overrides the deprecated
// idf(Term, Searcher), honor its value — but then no meaningful
// per-term explanation can be produced, hence "Inexplicable".
if(supportedMethods.overridesTermIDF) {
final float idf = idf(term, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: compute df/maxDoc eagerly (cheap ints) so the anonymous
// class captures plain finals; the explanation string is built lazily.
final int df = searcher.docFreq(term);
final int max = searcher.maxDoc();
final float idf = idf(df, max);
// NOTE: @Override on interface/abstract-method implementations is left
// commented out, presumably for pre-Java-6 compilation — TODO confirm.
return new IDFExplanation() {
//@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
//@Override
public float getIdf() {
return idf;
}};
}
/** Computes a score factor for a phrase. /** Computes a score factor for a phrase.
* *
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
@ -490,7 +543,8 @@ public abstract class Similarity implements Serializable {
* *
* @param terms the terms in the phrase * @param terms the terms in the phrase
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @return a score factor for the phrase * @return
* @deprecated see {@link #idfExplain(Collection, Searcher)}
*/ */
public float idf(Collection terms, Searcher searcher) throws IOException { public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f; float idf = 0.0f;
@ -501,6 +555,60 @@ public abstract class Similarity implements Serializable {
return idf; return idf;
} }
/**
 * Computes a score factor for a phrase.
 * 
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 * 
 * @param terms the terms in the phrase
 * @param searcher the document collection being searched
 * @return an {@link IDFExplanation} object that includes both an idf 
 *         score factor for the phrase and an explanation 
 *         for each term.
 * @throws IOException
 */
public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
// Back-compat path: if a subclass still overrides the deprecated
// idf(Collection, Searcher), honor its value; no per-term explanation
// can be produced in that case.
if(supportedMethods.overridesCollectionIDF) {
final float idf = idf(terms, searcher);
return new IDFExplanation() {
//@Override
public float getIdf() {
return idf;
}
//@Override
public String explain() {
return "Inexplicable";
}
};
}
// Default path: sum idf over the phrase terms while accumulating a
// " term=docFreq" fragment per term for the explanation string.
// Uses searcher-level (corpus-wide) docFreq/maxDoc, not per-reader stats.
final int max = searcher.maxDoc();
float idf = 0.0f;
final StringBuffer exp = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
final int df = searcher.docFreq(term);
idf += idf(df, max);
exp.append(" ");
exp.append(term.text());
exp.append("=");
exp.append(df);
}
// Capture the summed idf in an (effectively) final local for the
// anonymous class below.
final float fIdf = idf;
return new IDFExplanation() {
//@Override
public float getIdf() {
return fIdf;
}
//@Override
public String explain() {
return exp.toString();
}
};
}
/** Computes a score factor based on a term's document frequency (the number /** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the * of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are * {@link #tf(int)} factor for each term in the query and these products are
@ -578,4 +686,51 @@ public abstract class Similarity implements Serializable {
return scorePayload(fieldName, payload, offset, length); return scorePayload(fieldName, payload, offset, length);
} }
/** @deprecated Remove this when old API is removed! */
// Per-instance snapshot of which deprecated idf(...) overloads this
// concrete Similarity subclass overrides; consulted by idfExplain to
// decide between the back-compat path and the default path.
private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
/** @deprecated Remove this when old API is removed! */
// Immutable record of reflection results for one Similarity subclass.
private static final class MethodSupport implements Serializable {
final boolean overridesCollectionIDF, overridesTermIDF;
MethodSupport(Class clazz) {
overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
}
// A method counts as overridden when its declaring class is anything
// other than Similarity itself (i.e. some subclass redefined it).
private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
try {
return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
} catch (NoSuchMethodException e) {
// should not happen: both idf overloads are declared on Similarity
throw new RuntimeException(e);
}
}
/** @deprecated Remove this when old API is removed! */
private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
/** @deprecated Remove this when old API is removed! */
private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
}
/** @deprecated Remove this when old API is removed! */
// Cache of reflection lookups, keyed by subclass identity, so the
// getMethod() cost is paid once per Similarity implementation class.
private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
/** @deprecated Remove this when old API is removed! */
// Synchronized get-or-create on the shared cache above.
private static MethodSupport getSupportedMethods(Class clazz) {
MethodSupport supportedMethods;
synchronized(knownMethodSupport) {
supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
if (supportedMethods == null) {
knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
}
}
return supportedMethods;
}
/** The Similarity implementation used by default.
 * TODO: move back to top when old API is removed!
 * (Kept below the static reflection machinery so class initialization
 * order stays valid while the deprecated support code exists.)
 **/
private static Similarity defaultImpl = new DefaultSimilarity();
} }

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a term. /** A Query that matches documents containing a term.
@ -37,11 +38,13 @@ public class TermQuery extends Query {
private float idf; private float idf;
private float queryNorm; private float queryNorm;
private float queryWeight; private float queryWeight;
private IDFExplanation idfExp;
public TermWeight(Searcher searcher) public TermWeight(Searcher searcher)
throws IOException { throws IOException {
this.similarity = getSimilarity(searcher); this.similarity = getSimilarity(searcher);
idf = similarity.idf(term, searcher); // compute idf idfExp = similarity.idfExplain(term, searcher);
idf = idfExp.getIdf();
} }
public String toString() { return "weight(" + TermQuery.this + ")"; } public String toString() { return "weight(" + TermQuery.this + ")"; }
@ -75,8 +78,7 @@ public class TermQuery extends Query {
ComplexExplanation result = new ComplexExplanation(); ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) + Explanation expl = new Explanation(idf, idfExp.explain());
", maxDocs=" + reader.maxDoc() + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -18,12 +18,11 @@ package org.apache.lucene.search.spans;
*/ */
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator;
import java.util.Set; import java.util.Set;
/** /**
@ -38,6 +37,7 @@ public class SpanWeight extends Weight {
protected Set terms; protected Set terms;
protected SpanQuery query; protected SpanQuery query;
private IDFExplanation idfExp;
public SpanWeight(SpanQuery query, Searcher searcher) public SpanWeight(SpanQuery query, Searcher searcher)
throws IOException { throws IOException {
@ -45,8 +45,8 @@ public class SpanWeight extends Weight {
this.query = query; this.query = query;
terms=new HashSet(); terms=new HashSet();
query.extractTerms(terms); query.extractTerms(terms);
idfExp = similarity.idfExplain(terms, searcher);
idf = this.query.getSimilarity(searcher).idf(terms, searcher); idf = idfExp.getIdf();
} }
public Query getQuery() { return query; } public Query getQuery() { return query; }
@ -75,21 +75,8 @@ public class SpanWeight extends Weight {
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField(); String field = ((SpanQuery)getQuery()).getField();
StringBuffer docFreqs = new StringBuffer();
Iterator i = terms.iterator();
while (i.hasNext()) {
Term term = (Term)i.next();
docFreqs.append(term.text());
docFreqs.append("=");
docFreqs.append(reader.docFreq(term));
if (i.hasNext()) {
docFreqs.append(" ");
}
}
Explanation idfExpl = Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + docFreqs + ")"); new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight // explain query weight
Explanation queryExpl = new Explanation(); Explanation queryExpl = new Explanation();

View File

@ -17,6 +17,19 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
/** /**
* TestExplanations subclass focusing on basic query types * TestExplanations subclass focusing on basic query types
*/ */
@ -291,4 +304,67 @@ public class TestSimpleExplanations extends TestExplanations {
} }
/**
 * LUCENE-1847 regression test: explanations produced through a
 * MultiSearcher must use corpus-wide statistics (docFreq/maxDoc summed
 * across all sub-searchers), not the stats of the single sub-index that
 * happened to hold the hit. Three identical docs are split 2/1 across
 * two indices, so every explanation should report the combined counts
 * (docFreq=3, maxDocs=3), regardless of which sub-index scored the doc.
 */
public void testTermQueryMultiSearcherExplain() throws Exception {
// creating two directories for indices
Directory indexStoreA = new MockRAMDirectory();
Directory indexStoreB = new MockRAMDirectory();
Document lDoc = new Document();
lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc2 = new Document();
lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
Document lDoc3 = new Document();
lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
// index A gets two docs, index B gets one — total corpus is 3 docs
writerA.addDocument(lDoc);
writerA.addDocument(lDoc2);
writerA.optimize();
writerA.close();
writerB.addDocument(lDoc3);
writerB.close();
QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
Query query = parser.parse("handle:1");
Searcher[] searchers = new Searcher[2];
searchers[0] = new IndexSearcher(indexStoreB);
searchers[1] = new IndexSearcher(indexStoreA);
Searcher mSearcher = new MultiSearcher(searchers);
ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
// TermQuery explanation: must show combined docFreq=3/maxDocs=3
Explanation explain = mSearcher.explain(query, hits[0].doc);
String exp = explain.toString(0);
assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
assertTrue(exp, exp.indexOf("docFreq=3") > -1);
// PhraseQuery explanation: per-term "term=docFreq" entries, corpus-wide
query = parser.parse("handle:\"1 2\"");
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
// SpanQuery explanation: same corpus-wide per-term entries
query = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term("handle", "1")),
new SpanTermQuery(new Term("handle", "2")) }, 0, true);
hits = mSearcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
explain = mSearcher.explain(query, hits[0].doc);
exp = explain.toString(0);
assertTrue(exp, exp.indexOf("1=3") > -1);
assertTrue(exp, exp.indexOf("2=3") > -1);
mSearcher.close();
}
} }