LUCENE-3555: add support for distributed stats

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1197455 13f79535-47bb-0310-9956-ffa450edef68
2025-02-21 17:46:28 +00:00 · 2011-11-04 09:23:56 +00:00 · 2011-11-04 09:23:56 +00:00 · b19a207c86
commit b19a207c86
parent 5f8d4fc8af
21 changed files with 278 additions and 348 deletions
--- a/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java
+++ b/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java
@ -0,0 +1,72 @@
+package org.apache.lucene.search;
+
+import org.apache.lucene.index.IndexReader; // javadocs
+import org.apache.lucene.index.Terms;       // javadocs
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Contains statistics for a collection (field)
+ * @lucene.experimental
+ */
+public class CollectionStatistics {
+  private final String field;
+  private final int maxDoc;
+  private final int docCount;
+  private final long sumTotalTermFreq;
+  private final long sumDocFreq;
+  
+  public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
+    this.field = field;
+    this.maxDoc = maxDoc;
+    this.docCount = docCount;
+    this.sumTotalTermFreq = sumTotalTermFreq;
+    this.sumDocFreq = sumDocFreq;
+  }
+  
+  /** returns the field name */
+  public String field() {
+    return field;
+  }
+  
+  /** returns the total number of documents, regardless of 
+   * whether they all contain values for this field. 
+   * @see IndexReader#maxDoc() */
+  public int maxDoc() {
+    return maxDoc;
+  }
+  
+  /** returns the total number of documents that
+   * have at least one term for this field. 
+   * @see Terms#getDocCount() */
+  public int docCount() {
+    return docCount;
+  }
+  
+  /** returns the total number of tokens for this field
+   * @see Terms#getSumTotalTermFreq() */
+  public long sumTotalTermFreq() {
+    return sumTotalTermFreq;
+  }
+  
+  /** returns the total number of postings for this field 
+   * @see Terms#getSumDocFreq() */
+  public long sumDocFreq() {
+    return sumDocFreq;
+  }
+}
--- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java
@ -35,14 +35,18 @@ import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader.ReaderContext;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
 import org.apache.lucene.search.similarities.SimilarityProvider;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.NIOFSDirectory;    // javadoc
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.ReaderUtil;
+import org.apache.lucene.util.TermContext;
 import org.apache.lucene.util.ThreadInterruptedException;

 /** Implements search over a single IndexReader.
@ -860,4 +864,34 @@ public class IndexSearcher implements Closeable {
  public String toString() {
    return "IndexSearcher(" + reader + "; executor=" + executor + ")";
  }
+  
+  /**
+   * Returns {@link TermStatistics} for a term
+   * @lucene.experimental
+   */
+  public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
+    return new TermStatistics(term.bytes(), context.docFreq(), context.totalTermFreq());
+  };
+  
+  /**
+   * Returns {@link CollectionStatistics} for a field
+   * @lucene.experimental
+   */
+  public CollectionStatistics collectionStatistics(String field) throws IOException {
+    final int docCount;
+    final long sumTotalTermFreq;
+    final long sumDocFreq;
+    
+    Terms terms = MultiFields.getTerms(reader, field);
+    if (terms == null) {
+      docCount = 0;
+      sumTotalTermFreq = 0;
+      sumDocFreq = 0;
+    } else {
+      docCount = terms.getDocCount();
+      sumTotalTermFreq = terms.getSumTotalTermFreq();
+      sumDocFreq = terms.getSumDocFreq();
+    }
+    return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
+  }
 }
--- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@ -141,13 +141,15 @@ public class MultiPhraseQuery extends Query {
      final ReaderContext context = searcher.getTopReaderContext();
      
      // compute idf
-      ArrayList<TermContext> allTerms = new ArrayList<TermContext>();
+      ArrayList<TermStatistics> allTermStats = new ArrayList<TermStatistics>();
      for(final Term[] terms: termArrays) {
        for (Term term: terms) {
-          allTerms.add(TermContext.build(context, term, true));
+          TermContext termContext = TermContext.build(context, term, true);
+          allTermStats.add(searcher.termStatistics(term, termContext));
        }
      }
-      stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()]));
+      stats = similarity.computeStats(searcher.collectionStatistics(field), 
+          getBoost(), allTermStats.toArray(new TermStatistics[allTermStats.size()]));
    }

    @Override
--- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java
@ -190,9 +190,13 @@ public class PhraseQuery extends Query {
      this.similarity = searcher.getSimilarityProvider().get(field);
      final ReaderContext context = searcher.getTopReaderContext();
      states = new TermContext[terms.size()];
-      for (int i = 0; i < terms.size(); i++)
-        states[i] = TermContext.build(context, terms.get(i), true);
-      stats = similarity.computeStats(searcher, field, getBoost(), states);
+      TermStatistics termStats[] = new TermStatistics[terms.size()];
+      for (int i = 0; i < terms.size(); i++) {
+        final Term term = terms.get(i);
+        states[i] = TermContext.build(context, term, true);
+        termStats[i] = searcher.termStatistics(term, states[i]);
+      }
+      stats = similarity.computeStats(searcher.collectionStatistics(field), getBoost(), termStats);
    }

    @Override
--- a/lucene/src/java/org/apache/lucene/search/TermQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java
@ -54,7 +54,10 @@ public class TermQuery extends Query {
      assert termStates != null : "TermContext must not be null";
      this.termStates = termStates;
      this.similarity = searcher.getSimilarityProvider().get(term.field());
-      this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates);
+      this.stats = similarity.computeStats(
+          searcher.collectionStatistics(term.field()), 
+          getBoost(), 
+          searcher.termStatistics(term, termStates));
    }

    @Override
--- a/lucene/src/java/org/apache/lucene/search/TermStatistics.java
+++ b/lucene/src/java/org/apache/lucene/search/TermStatistics.java
@ -0,0 +1,53 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader; // javadocs
+import org.apache.lucene.util.BytesRef;
+/**
+ * Contains statistics for a specific term
+ * @lucene.experimental
+ */
+public class TermStatistics {
+  private final BytesRef term;
+  private final int docFreq;
+  private final long totalTermFreq;
+  
+  public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
+    this.term = term;
+    this.docFreq = docFreq;
+    this.totalTermFreq = totalTermFreq;
+  }
+  
+  /** returns the term text */
+  public BytesRef term() {
+    return term;
+  }
+  
+  /** returns the number of documents this term occurs in 
+   * @see IndexReader#docFreq(String, BytesRef) */
+  public int docFreq() {
+    return docFreq;
+  }
+  
+  /** returns the total number of occurrences of this term
+   * @see IndexReader#totalTermFreq(String, BytesRef) */
+  public long totalTermFreq() {
+    return totalTermFreq;
+  }
+}
--- a/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@ -20,14 +20,12 @@ package org.apache.lucene.search.similarities;
 import java.io.IOException;

 import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;
-import org.apache.lucene.util.TermContext;

 /**
 * BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
@ -75,15 +73,13 @@ public class BM25Similarity extends Similarity {
  /** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
   * or returns <code>1</code> if the index does not store sumTotalTermFreq (Lucene 3.x indexes
   * or any field that omits frequency information). */
-  protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
-    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field);
-    if (terms == null) {
-      // field does not exist;
-      return 1f;
+  protected float avgFieldLength(CollectionStatistics collectionStats) {
+    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
+    if (sumTotalTermFreq <= 0) {
+      return 1f;       // field does not exist, or stat is unsupported
+    } else {
+      return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
    }
-    long sumTotalTermFreq = terms.getSumTotalTermFreq();
-    long maxdoc = searcher.maxDoc();
-    return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc);
  }
  
  /** The default implementation encodes <code>boost / sqrt(length)</code>
@ -131,19 +127,19 @@ public class BM25Similarity extends Similarity {
    return encodeNormValue(state.getBoost(), numTerms);
  }

-  public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
-    final int df = stats.docFreq();
-    final int max = searcher.maxDoc();
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
+    final int df = termStats.docFreq();
+    final int max = collectionStats.maxDoc();
    final float idf = idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
  }

-  public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
-    final int max = searcher.maxDoc();
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
+    final int max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
-    for (final TermContext stat : stats ) {
+    for (final TermStatistics stat : termStats ) {
      final int df = stat.docFreq();
      final float termIdf = idf(df, max);
      exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
@ -154,10 +150,10 @@ public class BM25Similarity extends Similarity {
  }

  @Override
-  public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
-    Explanation idf = termStats.length == 1 ? idfExplain(termStats[0], searcher) : idfExplain(termStats, searcher);
+  public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
+    Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);

-    float avgdl = avgFieldLength(searcher, fieldName);
+    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float cache[] = new float[256];
--- a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
@ -17,11 +17,9 @@ package org.apache.lucene.search.similarities;
 * limitations under the License.
 */

-import java.io.IOException;
-
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.util.TermContext;
+import org.apache.lucene.search.TermStatistics;

 /**
 * Abstract superclass for language modeling Similarities. The following inner
@ -62,8 +60,8 @@ public abstract class LMSimilarity extends SimilarityBase {
   * usual statistics.
   */
  @Override
-  protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, String fieldName, TermContext termContext) throws IOException {
-    super.fillBasicStats(stats, searcher, fieldName, termContext);
+  protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
+    super.fillBasicStats(stats, collectionStats, termStats);
    LMStats lmStats = (LMStats) stats;
    lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
  }
--- a/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
@ -21,10 +21,10 @@ import java.io.IOException;

 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.TermContext;

 /**
 * Implements the CombSUM method for combining evidence from multiple
@ -45,10 +45,10 @@ public class MultiSimilarity extends Similarity {
  }

  @Override
-  public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
+  public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
    Stats subStats[] = new Stats[sims.length];
    for (int i = 0; i < subStats.length; i++) {
-      subStats[i] = sims[i].computeStats(searcher, fieldName, queryBoost, termContexts);
+      subStats[i] = sims[i].computeStats(collectionStats, queryBoost, termStats);
    }
    return new MultiStats(subStats);
  }
--- a/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java
@ -26,11 +26,13 @@ import org.apache.lucene.index.IndexReader; // javadoc
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.index.Terms; // javadoc
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.spans.SpanQuery; // javadoc
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat; // javadoc
@ -81,10 +83,10 @@ import org.apache.lucene.util.TermContext;
 * <a name="querytime"/>
 * At query-time, Queries interact with the Similarity via these steps:
 * <ol>
- *   <li>The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
+ *   <li>The {@link #computeStats(CollectionStatistics, float, TermStatistics...)} method is called a single time,
 *       allowing the implementation to compute any statistics (such as IDF, average document length, etc)
- *       across <i>the entire collection</i>. The {@link TermContext}s passed in are already positioned
- *       to the terms involved with the raw statistics involved, so a Similarity can freely use any combination
+ *       across <i>the entire collection</i>. The {@link TermStatistics} passed in already contain
+ *       the raw statistics involved, so a Similarity can freely use any combination
 *       of term statistics without causing any additional I/O. Lucene makes no assumption about what is 
 *       stored in the returned {@link Similarity.Stats} object.
 *   <li>The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
@ -128,7 +130,7 @@ public abstract class Similarity {
  /**
   * Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
   */
-  public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException;
+  public abstract Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats);
  
  /**
   * returns a new {@link Similarity.ExactDocScorer}.
--- a/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@ -20,15 +20,12 @@ package org.apache.lucene.search.similarities;
 import java.io.IOException;

 import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;
-import org.apache.lucene.util.TermContext;

 /**
 * A subclass of {@code Similarity} that provides a simplified API for its
@ -71,12 +68,11 @@ public abstract class SimilarityBase extends Similarity {
  }
  
  @Override
-  public final Stats computeStats(IndexSearcher searcher, String fieldName,
-      float queryBoost, TermContext... termContexts) throws IOException {
-    BasicStats stats[] = new BasicStats[termContexts.length];
-    for (int i = 0; i < termContexts.length; i++) {
+  public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
+    BasicStats stats[] = new BasicStats[termStats.length];
+    for (int i = 0; i < termStats.length; i++) {
      stats[i] = newStats(queryBoost);
-      fillBasicStats(stats[i], searcher, fieldName, termContexts[i]);
+      fillBasicStats(stats[i], collectionStats, termStats[i]);
    }
    return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
  }
@ -88,13 +84,11 @@ public abstract class SimilarityBase extends Similarity {
  
  /** Fills all member fields defined in {@code BasicStats} in {@code stats}. 
   *  Subclasses can override this method to fill additional stats. */
-  protected void fillBasicStats(BasicStats stats, IndexSearcher searcher,
-      String fieldName, TermContext termContext) throws IOException {
-    IndexReader reader = searcher.getIndexReader();
-    int numberOfDocuments = reader.maxDoc();
+  protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
+    int numberOfDocuments = collectionStats.maxDoc();
    
-    int docFreq = termContext.docFreq();
-    long totalTermFreq = termContext.totalTermFreq();
+    int docFreq = termStats.docFreq();
+    long totalTermFreq = termStats.totalTermFreq();

    // codec does not supply totalTermFreq: substitute docFreq
    if (totalTermFreq == -1) {
@ -103,25 +97,19 @@ public abstract class SimilarityBase extends Similarity {

    final long numberOfFieldTokens;
    final float avgFieldLength;
-    
-    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), fieldName);
-    if (terms == null) {
-      // field does not exist;
-      numberOfFieldTokens = 0;
-      avgFieldLength = 1;
-    } else {
-      long sumTotalTermFreq = terms.getSumTotalTermFreq();

+    long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
+
+    if (sumTotalTermFreq <= 0) {
+      // field does not exist;
      // We have to provide something if codec doesnt supply these measures,
      // or if someone omitted frequencies for the field... negative values cause
      // NaN/Inf for some scorers.
-      if (sumTotalTermFreq == -1) {
-        numberOfFieldTokens = docFreq;
-        avgFieldLength = 1;
-      } else {
-        numberOfFieldTokens = sumTotalTermFreq;
-        avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
-      }
+      numberOfFieldTokens = docFreq;
+      avgFieldLength = 1;
+    } else {
+      numberOfFieldTokens = sumTotalTermFreq;
+      avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
    }
 
    // TODO: add sumDocFreq for field (numberOfFieldPostings)
--- a/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@ -22,9 +22,11 @@ import java.io.IOException;

 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.TermContext;
 import org.apache.lucene.util.SmallFloat;
@ -575,15 +577,15 @@ public abstract class TFIDFSimilarity extends Similarity {
   * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
   * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute
   *   
-   * @param stats statistics of the term in question
-   * @param searcher the document collection being searched
+   * @param collectionStats collection-level statistics
+   * @param termStats term-level statistics for the term
   * @return an Explain object that includes both an idf score factor 
             and an explanation for the term.
   * @throws IOException
   */
-  public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
-    final int df = stats.docFreq();
-    final int max = searcher.maxDoc();
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
+    final int df = termStats.docFreq();
+    final int max = collectionStats.maxDoc();
    final float idf = idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
  }
@ -595,19 +597,19 @@ public abstract class TFIDFSimilarity extends Similarity {
   * The default implementation sums the idf factor for
   * each term in the phrase.
   * 
-   * @param stats statistics of the terms in the phrase
-   * @param searcher the document collection being searched
+   * @param collectionStats collection-level statistics
+   * @param termStats term-level statistics for the terms in the phrase
   * @return an Explain object that includes both an idf 
   *         score factor for the phrase and an explanation 
   *         for each term.
   * @throws IOException
   */
-  public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
-    final int max = searcher.maxDoc();
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
+    final int max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
-    for (final TermContext stat : stats ) {
+    for (final TermStatistics stat : termStats ) {
      final int df = stat.docFreq();
      final float termIdf = idf(df, max);
      exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
@ -693,11 +695,10 @@ public abstract class TFIDFSimilarity extends Similarity {
  public abstract float scorePayload(int doc, int start, int end, BytesRef payload);

  @Override
-  public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
-      TermContext... termContexts) throws IOException {
-    final Explanation idf = termContexts.length == 1
-    ? idfExplain(termContexts[0], searcher)
-    : idfExplain(termContexts, searcher);
+  public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
+    final Explanation idf = termStats.length == 1
+    ? idfExplain(collectionStats, termStats[0])
+    : idfExplain(collectionStats, termStats);
    return new IDFStats(idf, queryBoost);
  }

--- a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java
+++ b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java
@ -48,10 +48,17 @@ public class SpanWeight extends Weight {
    query.extractTerms(terms);
    final ReaderContext context = searcher.getTopReaderContext();
    final TermContext states[] = new TermContext[terms.size()];
+    final TermStatistics termStats[] = new TermStatistics[terms.size()];
    int i = 0;
-    for (Term term : terms)
-      states[i++] = TermContext.build(context, term, true);
-    stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states);
+    for (Term term : terms) {
+      states[i] = TermContext.build(context, term, true);
+      termStats[i] = searcher.termStatistics(term, states[i]);
+      i++;
+    }
+    stats = similarity.computeStats(
+        searcher.collectionStatistics(query.getField()), 
+        query.getBoost(), 
+        termStats);
  }

  @Override
--- a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java
+++ b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java
@ -21,7 +21,6 @@ import java.io.IOException;

 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TermContext;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@ -50,7 +49,7 @@ public class TestOmitTf extends LuceneTestCase {
        @Override public float tf(float freq) { return freq; }
        @Override public float sloppyFreq(int distance) { return 2.0f; }
        @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
-        @Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException {
+        @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
          return new Explanation(1.0f, "Inexplicable");
        }
        @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1.0f; }
--- a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java
+++ b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java
@ -22,12 +22,8 @@ import java.io.IOException;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.SimilarityProvider;
-import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
-import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
-import org.apache.lucene.search.similarities.Similarity.Stats;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.TermContext;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.util.PriorityQueue;

@ -250,7 +246,7 @@ final class JustCompileSearch {
  static final class JustCompileSimilarity extends Similarity {

    @Override
-    public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
+    public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
      throw new UnsupportedOperationException(UNSUPPORTED_MSG);
    }

--- a/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java
+++ b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java
@ -161,8 +161,8 @@ public class TestDocValuesScoring extends LuceneTestCase {
    }

    @Override
-    public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
-      return sim.computeStats(searcher, fieldName, queryBoost, termContexts);
+    public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
+      return sim.computeStats(collectionStats, queryBoost, termStats);
    }

    @Override
--- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@ -316,8 +316,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
        return new DefaultSimilarity() {
          
          @Override
-          public Explanation idfExplain(TermContext stats[],
-              IndexSearcher searcher) throws IOException {
+          public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
            return new Explanation(10f, "just a test");
          } 
        };
--- a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java
+++ b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java
@ -18,7 +18,6 @@ package org.apache.lucene.search;
 */

 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TermContext;

 import java.io.IOException;

@ -50,7 +49,7 @@ public class TestSimilarity extends LuceneTestCase {
        @Override public float tf(float freq) { return freq; }
        @Override public float sloppyFreq(int distance) { return 2.0f; }
        @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
-        @Override public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException {
+        @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
          return new Explanation(1.0f, "Inexplicable"); 
        }
      };
--- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java
@ -27,10 +27,12 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.similarities.DefaultSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
@ -42,7 +44,6 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TermContext;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;

@ -346,7 +347,7 @@ public class TestPayloadNearQuery extends LuceneTestCase {
    
        // idf used for phrase queries
        @Override 
-        public Explanation idfExplain(TermContext states[], IndexSearcher searcher) throws IOException {
+        public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
          return new Explanation(1.0f, "Inexplicable");
        }
      };
--- a/lucene/src/test/org/apache/lucene/search/similarities/SpoofIndexSearcher.java
+++ b/lucene/src/test/org/apache/lucene/search/similarities/SpoofIndexSearcher.java
@ -1,222 +0,0 @@
-package org.apache.lucene.search.similarities;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Map;
-
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FieldsEnum;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermVectorMapper;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.codecs.PerDocValues;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-
-/**
- * Index searcher implementation that takes an {@link BasicStats} instance and
- * returns statistics accordingly. Most of the methods are not implemented, so
- * it can only be used for Similarity unit testing.
- */
-public class SpoofIndexSearcher extends IndexSearcher {
-  public SpoofIndexSearcher(BasicStats stats) {
-    super(new SpoofIndexReader(stats));
-  }
-  
-  public static class SpoofIndexReader extends IndexReader {
-    /** The stats the reader has to return. */
-    protected BasicStats stats;
-    /** The fields the reader has to return. */
-    protected SpoofFields fields;
-    
-    public SpoofIndexReader(BasicStats stats) {
-      this.stats = stats;
-      this.fields = new SpoofFields(stats);
-    }
-
-    @Override
-    public int numDocs() {
-      return stats.getNumberOfDocuments();
-    }
-
-    @Override
-    public int maxDoc() {
-      return stats.getNumberOfDocuments();
-    }
-
-    @Override
-    public Fields fields() throws IOException {
-      return fields;
-    }
-
-    @Override
-    public Collection<String> getFieldNames(FieldOption fldOption) {
-      return Arrays.asList(new String[]{"spoof"});
-    }
-
-    @Override
-    public ReaderContext getTopReaderContext() {
-      return new AtomicReaderContext(this);
-    }
-    
-    @Override
-    public boolean hasDeletions() {
-      return false;
-    }
-
-    // ------------------------ Not implemented methods ------------------------
-    
-    @Override
-    public TermFreqVector[] getTermFreqVectors(int docNumber)
-        throws IOException {
-      return null;
-    }
-
-    @Override
-    public TermFreqVector getTermFreqVector(int docNumber, String field)
-        throws IOException {
-      return null;
-    }
-
-    @Override
-    public void getTermFreqVector(int docNumber, String field,
-        TermVectorMapper mapper) throws IOException {
-    }
-
-    @Override
-    public void getTermFreqVector(int docNumber, TermVectorMapper mapper)
-        throws IOException {
-    }
-
-    @Override
-    public void document(int docID, StoredFieldVisitor visitor) throws CorruptIndexException, IOException {
-    }
-
-    @Override
-    public byte[] norms(String field) throws IOException {
-      return null;
-    }
-
-    @Override
-    protected void doSetNorm(int doc, String field, byte value)
-        throws CorruptIndexException, IOException {
-    }
-
-    @Override
-    public PerDocValues perDocValues() throws IOException {
-      return null;
-    }
-
-    @Override
-    protected void doDelete(int docNum) throws CorruptIndexException,
-        IOException {
-    }
-
-    @Override
-    protected void doUndeleteAll() throws CorruptIndexException, IOException {
-    }
-
-    @Override
-    protected void doCommit(Map<String,String> commitUserData)
-        throws IOException {
-    }
-
-    @Override
-    protected void doClose() throws IOException {
-    }
-
-    @Override
-    public Bits getLiveDocs() {
-      return null;
-    }
-  }
-  
-  /** Spoof Fields class for Similarity testing. */
-  public static class SpoofFields extends Fields {
-    /** The stats the object has to return. */
-    protected SpoofTerms terms;
-    
-    public SpoofFields(BasicStats stats) {
-      this.terms = new SpoofTerms(stats);
-    }
-    
-    @Override
-    public Terms terms(String field) throws IOException {
-      return terms;
-    }
-    
-    // ------------------------ Not implemented methods ------------------------
-
-    @Override
-    public FieldsEnum iterator() throws IOException {
-      return null;
-    }
-  }
-  
-  /** Spoof Terms class for Similarity testing. */
-  public static class SpoofTerms extends Terms {
-    /** The stats the object has to return. */
-    protected BasicStats stats;
-    
-    public SpoofTerms(BasicStats stats) {
-      this.stats = stats;
-    }
-    
-    @Override
-    public long getSumTotalTermFreq() throws IOException {
-      return stats.getNumberOfFieldTokens();
-    }
-
-    @Override
-    public long getSumDocFreq() throws IOException {
-      return stats.getDocFreq();
-    }
-    
-    @Override
-    public int getDocCount() throws IOException {
-      return stats.getDocFreq();
-    }    
-    
-    // ------------------------ Not implemented methods ------------------------
-
-    
-    @Override
-    public TermsEnum iterator() throws IOException {
-      return null;
-    }
-
-    @Override
-    public long getUniqueTermCount() throws IOException {
-      return -1;
-    }
-
-    @Override
-    public Comparator<BytesRef> getComparator() throws IOException {
-      return null;
-    }
-  }
-}
--- a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@ -30,12 +30,15 @@ import org.apache.lucene.index.OrdTermState;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TermContext;
 import org.junit.Ignore;
@ -172,7 +175,14 @@ public class TestSimilarityBase extends LuceneTestCase {
    stats.setTotalTermFreq(TOTAL_TERM_FREQ);
    return stats;
  }
-
+  
+  private CollectionStatistics toCollectionStats(BasicStats stats) {
+    return new CollectionStatistics("spoof", stats.getNumberOfDocuments(), -1, stats.getNumberOfFieldTokens(), -1);
+  }
+  
+  private TermStatistics toTermStats(BasicStats stats) {
+    return new TermStatistics(new BytesRef("spoofyText"), stats.getDocFreq(), stats.getTotalTermFreq());
+  }
  /**
   * The generic test core called by all unit test methods. It calls the
   * {@link SimilarityBase#score(BasicStats, float, int)} method of all
@ -180,17 +190,11 @@ public class TestSimilarityBase extends LuceneTestCase {
   * is a finite positive real number.
   */
  private void unitTestCore(BasicStats stats, float freq, int docLen)
-      throws IOException {
-    // We have to fake everything, because computeStats() can be overridden and
-    // there is no way to inject false data after fillBasicStats().
-    SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats);
-    TermContext tc = new TermContext(
-        searcher.getIndexReader().getTopReaderContext(),
-        new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq());
-    
+      throws IOException { 
    for (SimilarityBase sim : sims) {
-      BasicStats realStats = (BasicStats) sim.computeStats(new SpoofIndexSearcher(stats),
-          "spoof", stats.getTotalBoost(), tc);
+      BasicStats realStats = (BasicStats) sim.computeStats(toCollectionStats(stats), 
+          stats.getTotalBoost(),
+          toTermStats(stats));
      float score = sim.score(realStats, freq, docLen);
      float explScore = sim.explain(
          realStats, 1, new Explanation(freq, "freq"), docLen).getValue();
@ -520,16 +524,10 @@ public class TestSimilarityBase extends LuceneTestCase {
   */
  private void correctnessTestCore(SimilarityBase sim, float gold)
      throws IOException {
-    // We have to fake everything, because computeStats() can be overridden and
-    // there is no way to inject false data after fillBasicStats().
    BasicStats stats = createStats();
-    SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats);
-    TermContext tc = new TermContext(
-        searcher.getIndexReader().getTopReaderContext(),
-        new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq());
-    
-    BasicStats realStats = (BasicStats) sim.computeStats(
-        searcher, "spoof", stats.getTotalBoost(), tc);
+    BasicStats realStats = (BasicStats) sim.computeStats(toCollectionStats(stats), 
+        stats.getTotalBoost(),
+        toTermStats(stats));
    float score = sim.score(realStats, FREQ, DOC_LEN);
    assertEquals(
        sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON);