diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f4027d5a252..21bd358b261 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -169,6 +169,9 @@ Bug Fixes * LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results. (Johannes Christen, Uwe Schindler, Robert Muir) +* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter. + (Robert Muir) + Changes in Runtime Behavior * LUCENE-4109: Enable position increments in the flexible queryparser by default. diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java index a60422c87dd..147209f1c66 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java @@ -60,18 +60,14 @@ public final class QueryTermExtractor public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) { WeightedTerm[] terms=getTerms(query,false, fieldName); - int totalNumDocs=reader.numDocs(); + int totalNumDocs=reader.maxDoc(); for (int i = 0; i < terms.length; i++) { try { int docFreq=reader.docFreq(new Term(fieldName,terms[i].term)); - // docFreq counts deletes - if(totalNumDocs < docFreq) { - docFreq = totalNumDocs; - } //IDF algorithm taken from DefaultSimilarity class - float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0); + float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0); terms[i].weight*=idf; } catch (IOException e) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index 161225b40d6..61efe7544d5 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor { Map terms = new PositionCheckingMap(); extract(query, terms); - int totalNumDocs = reader.numDocs(); + int totalNumDocs = reader.maxDoc(); Set weightedTerms = terms.keySet(); Iterator it = weightedTerms.iterator(); @@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor { while (it.hasNext()) { WeightedSpanTerm weightedSpanTerm = terms.get(it.next()); int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); - // docFreq counts deletes - if(totalNumDocs < docFreq) { - docFreq = totalNumDocs; - } // IDF algorithm taken from DefaultSimilarity class - float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0); + float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0); weightedSpanTerm.weight *= idf; } } finally { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 236437cd1b1..ac1e39c9e1e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -95,8 +95,7 @@ public class FieldTermStack { DocsAndPositionsEnum dpEnum = null; BytesRef text; - int numDocs = reader.numDocs() - reader.numDeletedDocs(); - float weight = 0; + int numDocs = reader.maxDoc(); while ((text = termsEnum.next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); @@ -111,13 +110,14 @@ public class FieldTermStack { } dpEnum.nextDoc(); + + // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html + final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 ); final int freq = dpEnum.freq(); for(int i = 0;i < freq;i++) { int pos = dpEnum.nextPosition(); - // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html - weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 ); if (dpEnum.startOffset() < 0) { return; // no offsets, null snippet }