LUCENE-4648: use only within-doc stats for PostingsHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1425992 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-12-26 19:14:28 +00:00
parent aa00eb2e70
commit 936a67bb7d
2 changed files with 17 additions and 40 deletions

org/apache/lucene/sandbox/postingshighlight/PassageScorer.java

@@ -17,9 +17,6 @@ package org.apache.lucene.sandbox.postingshighlight;
  * limitations under the License.
  */
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.TermStatistics;
 /**
  * Used for ranking passages.
  * <p>
@@ -44,16 +41,17 @@ public class PassageScorer {
   public static final float pivot = 87f;
   /**
-   * Computes term importance, given its collection-wide statistics.
+   * Computes term importance, given its in-document statistics.
    *
-   * @param collectionStats statistics for the collection
-   * @param termStats statistics for the term
+   * @param contentLength length of document in characters
+   * @param totalTermFreq number of time term occurs in document
    * @return term importance
    */
-  public float weight(CollectionStatistics collectionStats, TermStatistics termStats) {
-    long numDocs = collectionStats.maxDoc();
-    long docFreq = termStats.docFreq();
-    return (k1 + 1) * (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+  public float weight(int contentLength, int totalTermFreq) {
+    // approximate #docs from content length
+    float numDocs = 1 + contentLength / pivot;
+    // numDocs not numDocs - docFreq (ala DFR), since we approximate numDocs
+    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
   }
   /**
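For reference, a minimal standalone sketch of the new within-document weight; k1 = 1.2 is an assumed value for the class's saturation constant, while pivot = 87 matches the constant shown above:

// Sketch only: term weight computed from within-document stats, no CollectionStatistics needed.
class WithinDocWeightSketch {
  static final float k1 = 1.2f;    // assumed BM25-style saturation constant
  static final float pivot = 87f;  // characters per pseudo-document (roughly one sentence)

  static float weight(int contentLength, int totalTermFreq) {
    // approximate the number of pseudo-documents from the document length in characters
    float numDocs = 1 + contentLength / pivot;
    // use numDocs rather than numDocs - docFreq (a la DFR), since numDocs is itself an approximation
    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
  }
}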

org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java

@@ -39,16 +39,12 @@ import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
@@ -135,6 +131,7 @@ public final class PostingsHighlighter {
     SortedSet<Term> terms = new TreeSet<Term>();
     query.extractTerms(terms);
     terms = terms.subSet(floor, ceiling);
+    Term termTexts[] = terms.toArray(new Term[terms.size()]);
     // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
     int docids[] = new int[scoreDocs.length];
@@ -156,24 +153,6 @@ public final class PostingsHighlighter {
       visitor.reset();
     }
-    // now pull index stats: TODO: we should probably pull this from the reader instead?
-    // this could be a distributed call, which is crazy
-    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
-    TermContext termContexts[] = new TermContext[terms.size()];
-    Term termTexts[] = new Term[terms.size()]; // needed for seekExact
-    float weights[] = new float[terms.size()];
-    int upto = 0;
-    for (Term term : terms) {
-      termTexts[upto] = term;
-      TermContext context = TermContext.build(readerContext, term, true);
-      termContexts[upto] = context;
-      TermStatistics termStats = searcher.termStatistics(term, context);
-      weights[upto] = scorer.weight(collectionStats, termStats);
-      upto++;
-      // TODO: should we instead score all the documents term-at-a-time here?
-      // the i/o would be better, but more transient ram
-    }
     BreakIterator bi = (BreakIterator)breakIterator.clone();
     Map<Integer,String> highlights = new HashMap<Integer,String>();
@@ -201,7 +180,7 @@ public final class PostingsHighlighter {
         termsEnum = t.iterator(null);
         postings = new DocsAndPositionsEnum[terms.size()];
       }
-      Passage passages[] = highlightDoc(termTexts, termContexts, subContext.ord, weights, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
       if (passages.length > 0) {
         // otherwise a null snippet
         highlights.put(doc, formatter.format(passages, content));
@@ -219,9 +198,10 @@ public final class PostingsHighlighter {
   // algorithm: treat sentence snippets as miniature documents
   // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
   // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(Term termTexts[], TermContext[] terms, int ord, float[] weights,
-      int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+  private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
+      TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
     PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
+    float weights[] = new float[terms.length];
     // initialize postings
     for (int i = 0; i < terms.length; i++) {
       DocsAndPositionsEnum de = postings[i];
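An illustrative sketch of the scoring shape described in the comment above; norm and tf here are hypothetical stand-ins, not the actual PassageScorer methods:

// Hypothetical illustration: a sentence ("passage") scores as
// norm(sentenceStartOffset) * sum over query terms of weight[i] * tf(freq in passage).
static float passageScore(float[] weights, int[] passageFreqs, int sentenceStartOffset) {
  float norm = 1f / (1f + sentenceStartOffset / 87f);        // assumption: earlier passages get a boost
  float sum = 0f;
  for (int i = 0; i < weights.length; i++) {
    sum += weights[i] * (float) Math.sqrt(passageFreqs[i]);  // assumption: sublinear term frequency
  }
  return norm * sum;
}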
@@ -230,11 +210,9 @@ public final class PostingsHighlighter {
         continue;
       } else if (de == null) {
         postings[i] = EMPTY; // initially
-        TermState ts = terms[i].get(ord);
-        if (ts == null) {
-          continue;
+        if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+          continue; // term not found
         }
-        termsEnum.seekExact(termTexts[i].bytes(), ts);
         DocsAndPositionsEnum de2 = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
         if (de2 == null) {
           continue;
@@ -250,6 +228,7 @@ public final class PostingsHighlighter {
       }
       if (doc == pDoc) {
+        weights[i] = scorer.weight(contentLength, de.freq());
         de.nextPosition();
         pq.add(new OffsetsEnum(de, i));
       }
@@ -315,7 +294,7 @@ public final class PostingsHighlighter {
         int tf = 0;
         while (true) {
           tf++;
-          current.addMatch(start, end, termTexts[off.id]);
+          current.addMatch(start, end, terms[off.id]);
           if (off.pos == dp.freq()) {
             break; // removed from pq
           } else {