mirror of https://github.com/apache/lucene.git
LUCENE-4289: fix highlighter idf inconsistencies/inefficiencies
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369859 13f79535-47bb-0310-9956-ffa450edef68
parent c1f2562aff
commit 462ff90d8e
@@ -169,6 +169,9 @@ Bug Fixes
 * LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results.
   (Johannes Christen, Uwe Schindler, Robert Muir)
 
+* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
+  (Robert Muir)
+
 Changes in Runtime Behavior
 
 * LUCENE-4109: Enable position increments in the flexible queryparser by default.
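The inconsistency behind these changes: IndexReader.docFreq() counts deleted documents, while numDocs() excludes them, so on an index with deletes a term's docFreq could exceed the document count and the old code had to clamp it. maxDoc() counts deleted documents too, which keeps the two figures consistent and makes the clamp unnecessary. A minimal sketch of the corrected computation, assuming a 4.x-era IndexReader; the helper class and method names are illustrative, not part of this commit:

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;

    final class IdfSketch {
      // IDF as in DefaultSimilarity: log(numDocs / (docFreq + 1)) + 1.
      // Both maxDoc() and docFreq() include deleted documents, so
      // docFreq <= totalNumDocs always holds and no clamp is required.
      static float idf(IndexReader reader, String field, String termText) throws IOException {
        int totalNumDocs = reader.maxDoc();
        int docFreq = reader.docFreq(new Term(field, termText));
        return (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
      }
    }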
@@ -60,18 +60,14 @@ public final class QueryTermExtractor
   public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
   {
     WeightedTerm[] terms=getTerms(query,false, fieldName);
-    int totalNumDocs=reader.numDocs();
+    int totalNumDocs=reader.maxDoc();
     for (int i = 0; i < terms.length; i++)
     {
       try
       {
         int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
-        // docFreq counts deletes
-        if(totalNumDocs < docFreq) {
-          docFreq = totalNumDocs;
-        }
         //IDF algorithm taken from DefaultSimilarity class
-        float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
+        float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
         terms[i].weight*=idf;
       }
       catch (IOException e)
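For context, a hedged usage sketch of the method touched above. Only getIdfWeightedTerms comes from this file; the index setup and the "contents" field name are assumptions for illustration:

    import java.io.IOException;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.highlight.QueryTermExtractor;
    import org.apache.lucene.search.highlight.WeightedTerm;
    import org.apache.lucene.store.Directory;

    final class IdfWeightedTermsExample {
      // Hypothetical caller: extract query terms and scale weights by IDF.
      static void printWeights(Directory dir) throws IOException {
        IndexReader reader = DirectoryReader.open(dir);
        try {
          Query query = new TermQuery(new Term("contents", "lucene"));
          WeightedTerm[] terms = QueryTermExtractor.getIdfWeightedTerms(query, reader, "contents");
          for (WeightedTerm t : terms) {
            System.out.println(t.getTerm() + " weight=" + t.getWeight());
          }
        } finally {
          reader.close();
        }
      }
    }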
@@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor {
     Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
     extract(query, terms);
 
-    int totalNumDocs = reader.numDocs();
+    int totalNumDocs = reader.maxDoc();
     Set<String> weightedTerms = terms.keySet();
     Iterator<String> it = weightedTerms.iterator();
 
@@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor {
       while (it.hasNext()) {
         WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
         int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
-        // docFreq counts deletes
-        if(totalNumDocs < docFreq) {
-          docFreq = totalNumDocs;
-        }
         // IDF algorithm taken from DefaultSimilarity class
-        float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
+        float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
         weightedSpanTerm.weight *= idf;
       }
     } finally {
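The same dead code falls out here: once totalNumDocs comes from maxDoc(), the clamp can never fire, because a term cannot appear in more documents (live or deleted) than the index has ever held. Stated as an assertion, purely for illustration:

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;

    final class DocFreqInvariant {
      // Illustrative only: docFreq counts a subset of the documents that
      // maxDoc() counts, so the removed clamp could never trigger after the fix.
      static void check(IndexReader reader, String fieldName, String termText) throws IOException {
        assert reader.docFreq(new Term(fieldName, termText)) <= reader.maxDoc();
      }
    }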
@@ -95,8 +95,7 @@ public class FieldTermStack {
     DocsAndPositionsEnum dpEnum = null;
     BytesRef text;
 
-    int numDocs = reader.numDocs() - reader.numDeletedDocs();
-    float weight = 0;
+    int numDocs = reader.maxDoc();
 
     while ((text = termsEnum.next()) != null) {
       UnicodeUtil.UTF8toUTF16(text, spare);
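The old FieldTermStack line subtracted deletes twice: numDocs() already excludes deleted documents, and maxDoc() equals numDocs() plus numDeletedDocs(). A worked sketch with hypothetical counts (100 documents added, 10 deleted but not yet merged away):

    import org.apache.lucene.index.IndexReader;

    final class DocCountSketch {
      static void show(IndexReader reader) {
        int maxDoc = reader.maxDoc();              // 100: includes deletes, matches docFreq()
        int numDocs = reader.numDocs();            //  90: live documents only
        int numDeleted = reader.numDeletedDocs();  //  10: == maxDoc - numDocs
        int oldCount = numDocs - numDeleted;       //  80: deletes subtracted twice (the bug)
        int newCount = maxDoc;                     // 100: what this commit uses
        System.out.println(oldCount + " vs " + newCount);
      }
    }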
@@ -112,12 +111,13 @@ public class FieldTermStack {
 
       dpEnum.nextDoc();
 
+      // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
+      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
+
       final int freq = dpEnum.freq();
 
       for(int i = 0;i < freq;i++) {
         int pos = dpEnum.nextPosition();
-        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
-        weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
         if (dpEnum.startOffset() < 0) {
           return; // no offsets, null snippet
         }
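Besides the counting fix, this hunk removes an inefficiency: the weight depends only on the term, yet the old code recomputed it (including a docFreq lookup) on every position of the term in the document. Hoisting it above the position loop computes it once per term. The general pattern, as a sketch with illustrative names:

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;

    final class HoistSketch {
      // Loop-invariant code motion: the IDF-style weight depends only on the
      // term, so compute it once, before iterating the term's positions.
      static float weightOncePerTerm(DocsAndPositionsEnum dpEnum, int numDocs, int docFreq)
          throws IOException {
        final float weight = (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
        final int freq = dpEnum.freq();
        for (int i = 0; i < freq; i++) {
          int pos = dpEnum.nextPosition();  // per-position work would use pos and weight
        }
        return weight;
      }
    }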