LUCENE-4289: fix highlighter idf inconsistencies/inefficiencies

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369859 13f79535-47bb-0310-9956-ffa450edef68
2012-08-06 14:55:41 +00:00 · 2012-08-06 14:55:41 +00:00 · 462ff90d8e
parent c1f2562aff
commit 462ff90d8e
4 changed files with 11 additions and 16 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -169,6 +169,9 @@ Bug Fixes
 * LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results.
  (Johannes Christen, Uwe Schindler, Robert Muir)

+* LUCENE-4289: Fix minor IDF inconsistencies/inefficiencies in highlighter.
+  (Robert Muir)
+
 Changes in Runtime Behavior

 * LUCENE-4109: Enable position increments in the flexible queryparser by default.
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
@ -60,18 +60,14 @@ public final class QueryTermExtractor
 	public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) 
 	{
 	    WeightedTerm[] terms=getTerms(query,false, fieldName);
-	    int totalNumDocs=reader.numDocs();
+	    int totalNumDocs=reader.maxDoc();
 	    for (int i = 0; i < terms.length; i++)
        {
 	        try
            {
                int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
-                // docFreq counts deletes
-                if(totalNumDocs < docFreq) {
-                  docFreq = totalNumDocs;
-                }
                //IDF algorithm taken from DefaultSimilarity class
-                float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
+                float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
                terms[i].weight*=idf;
            } 
 	        catch (IOException e)
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor {
    Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

-    int totalNumDocs = reader.numDocs();
+    int totalNumDocs = reader.maxDoc();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor {
      while (it.hasNext()) {
        WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
        int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
-        // docFreq counts deletes
-        if(totalNumDocs < docFreq) {
-          docFreq = totalNumDocs;
-        }
        // IDF algorithm taken from DefaultSimilarity class
-        float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
+        float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
        weightedSpanTerm.weight *= idf;
      }
    } finally {
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
@ -95,8 +95,7 @@ public class FieldTermStack {
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;
    
-    int numDocs = reader.numDocs() - reader.numDeletedDocs();
-    float weight = 0;
+    int numDocs = reader.maxDoc();
    
    while ((text = termsEnum.next()) != null) {
      UnicodeUtil.UTF8toUTF16(text, spare);
@ -111,13 +110,14 @@ public class FieldTermStack {
      }

      dpEnum.nextDoc();
+      
+      // Weight is the term's IDF, computed with the DefaultSimilarity formula; see http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
+      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );

      final int freq = dpEnum.freq();
      
      for(int i = 0;i < freq;i++) {
        int pos = dpEnum.nextPosition();
-        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
-        weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
        if (dpEnum.startOffset() < 0) {
          return; // no offsets, null snippet
        }