LUCENE-1355: highlighter can incorrectly produce negative idf when index has deletes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@687052 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2008-08-19 13:16:41 +00:00
parent 0541628af1
commit 7675606908
3 changed files with 69 additions and 1 deletions

View File

@ -66,6 +66,10 @@ public final class QueryTermExtractor
try try
{ {
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term)); int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
// docFreq counts deletes
if(totalNumDocs < docFreq) {
docFreq = totalNumDocs;
}
//IDF algorithm taken from DefaultSimilarity class //IDF algorithm taken from DefaultSimilarity class
float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0); float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
terms[i].weight*=idf; terms[i].weight*=idf;

View File

@ -409,7 +409,10 @@ public class WeightedSpanTermExtractor {
while (it.hasNext()) { while (it.hasNext()) {
WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next()); WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
// docFreq counts deletes
if(totalNumDocs < docFreq) {
docFreq = totalNumDocs;
}
// IDF algorithm taken from DefaultSimilarity class // IDF algorithm taken from DefaultSimilarity class
float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0); float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
weightedSpanTerm.weight *= idf; weightedSpanTerm.weight *= idf;

View File

@ -42,9 +42,14 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.FilteredQuery;
@ -57,12 +62,14 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter; import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element; import org.w3c.dom.Element;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
@ -1246,6 +1253,60 @@ public class HighlighterTest extends TestCase implements Formatter {
helper.start(); helper.start();
} }
// Shared fixture for the LUCENE-1355 regression test below: an in-memory index
// and the analyzer used both to build it and to run the highlighter query.
private Directory dir = new RAMDirectory();
// Whitespace tokenization keeps "del" as a standalone token, so the
// delete-by-term in deleteDocument() can match it.
private Analyzer a = new WhitespaceAnalyzer();
// Regression test for LUCENE-1355: IndexReader.docFreq() still counts deleted
// documents, so after a delete the highlighter's idf computation could go
// negative and getBestFragment() would return null. The call order matters:
// build the index, delete some docs (without merging), then search/highlight.
public void testWeightedTermsWithDeletes() throws IOException, ParseException {
    makeIndex();
    deleteDocument();
    searchIndex();
}
/**
 * Builds a single-field document: the value is stored and tokenized.
 *
 * @param f field name
 * @param v field value
 * @return a new {@link Document} holding exactly one field
 */
private Document doc( String f, String v ){
    Document document = new Document();
    Field field = new Field( f, v, Store.YES, Index.TOKENIZED );
    document.add( field );
    return document;
}
/**
 * Fills {@link #dir} with four "t_text1" documents; three contain the token
 * "del" so they can later be removed by deleteDocument(). The index is
 * optimized so it starts out with no deletions.
 */
private void makeIndex() throws IOException {
    String[] values = {
        "random words for highlighting tests del",
        "more random words for second field del",
        "random words for highlighting tests del",
        "more random words for second field"
    };
    IndexWriter writer = new IndexWriter( dir, a, MaxFieldLength.LIMITED );
    for( int i = 0; i < values.length; i++ ){
        writer.addDocument( doc( "t_text1", values[i] ) );
    }
    writer.optimize();
    writer.close();
}
/**
 * Reopens the index for append and deletes every document whose "t_text1"
 * field contains the token "del", leaving the deletions unmerged.
 */
private void deleteDocument() throws IOException {
    IndexWriter appendWriter = new IndexWriter( dir, a, false, MaxFieldLength.LIMITED );
    appendWriter.deleteDocuments( new Term( "t_text1", "del" ) );
    // Deliberately NOT optimizing: a merge would expunge the deletions and
    // hide the negative-idf bug this test reproduces. Uncommenting the line
    // below makes the test pass even without the fix.
    //writer.optimize();
    appendWriter.close();
}
/**
 * Searches "t_text1" for "random" and highlights every hit. Before the
 * LUCENE-1355 fix, the idf computed by this reader-based QueryScorer could go
 * negative on an index with unmerged deletions, making getBestFragment()
 * return null; the assertEquals below fails in that case.
 */
private void searchIndex() throws IOException, ParseException {
    String q = "t_text1:random";
    QueryParser parser = new QueryParser( "t_text1", a );
    Query query = parser.parse( q );
    IndexSearcher searcher = new IndexSearcher( dir );
    // This reader-based scorer is the one that could return negative idf.
    Scorer scorer = new QueryScorer( query, searcher.getIndexReader(), "t_text1" );
    Highlighter h = new Highlighter( scorer );
    TopDocs hits = searcher.search(query, null, 10);
    // Iterate over the collected hits, not totalHits: totalHits can exceed
    // the number of ScoreDocs actually returned (capped at 10 here), which
    // would throw ArrayIndexOutOfBoundsException.
    for( int i = 0; i < hits.scoreDocs.length; i++ ){
        Document doc = searcher.doc( hits.scoreDocs[i].doc );
        String result = h.getBestFragment( a, "t_text1", doc.get( "t_text1" ));
        System.out.println("result:" + result);
        assertEquals("more <B>random</B> words for second field", result);
    }
    searcher.close();
}
/* /*
* *
* public void testBigramAnalyzer() throws IOException, ParseException { * public void testBigramAnalyzer() throws IOException, ParseException {