LUCENE-4289: fix highlighter idf inconsistencies/inefficiencies

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369859 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-08-06 14:55:41 +00:00
parent c1f2562aff
commit 462ff90d8e
4 changed files with 11 additions and 16 deletions

View File

@ -169,6 +169,9 @@ Bug Fixes
* LUCENE-4282: Automaton FuzzyQuery didn't always deliver all results. * LUCENE-4282: Automaton FuzzyQuery didn't always deliver all results.
(Johannes Christen, Uwe Schindler, Robert Muir) (Johannes Christen, Uwe Schindler, Robert Muir)
* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
(Robert Muir)
Changes in Runtime Behavior Changes in Runtime Behavior
* LUCENE-4109: Enable position increments in the flexible queryparser by default. * LUCENE-4109: Enable position increments in the flexible queryparser by default.

View File

@ -60,18 +60,14 @@ public final class QueryTermExtractor
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{ {
WeightedTerm[] terms=getTerms(query,false, fieldName); WeightedTerm[] terms=getTerms(query,false, fieldName);
int totalNumDocs=reader.numDocs(); int totalNumDocs=reader.maxDoc();
for (int i = 0; i < terms.length; i++) for (int i = 0; i < terms.length; i++)
{ {
try try
{ {
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term)); int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
// docFreq counts deletes
if(totalNumDocs < docFreq) {
docFreq = totalNumDocs;
}
//IDF algorithm taken from DefaultSimilarity class //IDF algorithm taken from DefaultSimilarity class
float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0); float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
terms[i].weight*=idf; terms[i].weight*=idf;
} }
catch (IOException e) catch (IOException e)

View File

@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor {
Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>(); Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
extract(query, terms); extract(query, terms);
int totalNumDocs = reader.numDocs(); int totalNumDocs = reader.maxDoc();
Set<String> weightedTerms = terms.keySet(); Set<String> weightedTerms = terms.keySet();
Iterator<String> it = weightedTerms.iterator(); Iterator<String> it = weightedTerms.iterator();
@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor {
while (it.hasNext()) { while (it.hasNext()) {
WeightedSpanTerm weightedSpanTerm = terms.get(it.next()); WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
// docFreq counts deletes
if(totalNumDocs < docFreq) {
docFreq = totalNumDocs;
}
// IDF algorithm taken from DefaultSimilarity class // IDF algorithm taken from DefaultSimilarity class
float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0); float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
weightedSpanTerm.weight *= idf; weightedSpanTerm.weight *= idf;
} }
} finally { } finally {

View File

@ -95,8 +95,7 @@ public class FieldTermStack {
DocsAndPositionsEnum dpEnum = null; DocsAndPositionsEnum dpEnum = null;
BytesRef text; BytesRef text;
int numDocs = reader.numDocs() - reader.numDeletedDocs(); int numDocs = reader.maxDoc();
float weight = 0;
while ((text = termsEnum.next()) != null) { while ((text = termsEnum.next()) != null) {
UnicodeUtil.UTF8toUTF16(text, spare); UnicodeUtil.UTF8toUTF16(text, spare);
@ -112,12 +111,13 @@ public class FieldTermStack {
dpEnum.nextDoc(); dpEnum.nextDoc();
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
final int freq = dpEnum.freq(); final int freq = dpEnum.freq();
for(int i = 0;i < freq;i++) { for(int i = 0;i < freq;i++) {
int pos = dpEnum.nextPosition(); int pos = dpEnum.nextPosition();
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
if (dpEnum.startOffset() < 0) { if (dpEnum.startOffset() < 0) {
return; // no offsets, null snippet return; // no offsets, null snippet
} }