mirror of https://github.com/apache/lucene.git
LUCENE-4289: fix highlighter idf inconsistencies/inefficiencies
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369859 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c1f2562aff
commit
462ff90d8e
|
@ -169,6 +169,9 @@ Bug Fixes
|
|||
* LUCENE-4282: Automaton FuzzyQuery didn't always deliver all results.
|
||||
(Johannes Christen, Uwe Schindler, Robert Muir)
|
||||
|
||||
* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
|
||||
(Robert Muir)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-4109: Enable position increments in the flexible queryparser by default.
|
||||
|
|
|
@ -60,18 +60,14 @@ public final class QueryTermExtractor
|
|||
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
|
||||
{
|
||||
WeightedTerm[] terms=getTerms(query,false, fieldName);
|
||||
int totalNumDocs=reader.numDocs();
|
||||
int totalNumDocs=reader.maxDoc();
|
||||
for (int i = 0; i < terms.length; i++)
|
||||
{
|
||||
try
|
||||
{
|
||||
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
|
||||
// docFreq counts deletes
|
||||
if(totalNumDocs < docFreq) {
|
||||
docFreq = totalNumDocs;
|
||||
}
|
||||
//IDF algorithm taken from DefaultSimilarity class
|
||||
float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
|
||||
float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
|
||||
terms[i].weight*=idf;
|
||||
}
|
||||
catch (IOException e)
|
||||
|
|
|
@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor {
|
|||
Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
|
||||
extract(query, terms);
|
||||
|
||||
int totalNumDocs = reader.numDocs();
|
||||
int totalNumDocs = reader.maxDoc();
|
||||
Set<String> weightedTerms = terms.keySet();
|
||||
Iterator<String> it = weightedTerms.iterator();
|
||||
|
||||
|
@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor {
|
|||
while (it.hasNext()) {
|
||||
WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
|
||||
int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
|
||||
// docFreq counts deletes
|
||||
if(totalNumDocs < docFreq) {
|
||||
docFreq = totalNumDocs;
|
||||
}
|
||||
// IDF algorithm taken from DefaultSimilarity class
|
||||
float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
|
||||
float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
|
||||
weightedSpanTerm.weight *= idf;
|
||||
}
|
||||
} finally {
|
||||
|
|
|
@ -95,8 +95,7 @@ public class FieldTermStack {
|
|||
DocsAndPositionsEnum dpEnum = null;
|
||||
BytesRef text;
|
||||
|
||||
int numDocs = reader.numDocs() - reader.numDeletedDocs();
|
||||
float weight = 0;
|
||||
int numDocs = reader.maxDoc();
|
||||
|
||||
while ((text = termsEnum.next()) != null) {
|
||||
UnicodeUtil.UTF8toUTF16(text, spare);
|
||||
|
@ -112,12 +111,13 @@ public class FieldTermStack {
|
|||
|
||||
dpEnum.nextDoc();
|
||||
|
||||
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
|
||||
final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
|
||||
|
||||
final int freq = dpEnum.freq();
|
||||
|
||||
for(int i = 0;i < freq;i++) {
|
||||
int pos = dpEnum.nextPosition();
|
||||
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
|
||||
weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
|
||||
if (dpEnum.startOffset() < 0) {
|
||||
return; // no offsets, null snippet
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue