LUCENE-1355: highlighter can incorrectly produce negative idf when index has deletes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@687052 13f79535-47bb-0310-9956-ffa450edef68
2008-08-19 13:16:41 +00:00 · 2008-08-19 13:16:41 +00:00 · 7675606908
parent 0541628af1
commit 7675606908
3 changed files with 69 additions and 1 deletions
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
@ -66,6 +66,10 @@ public final class QueryTermExtractor
 	        try
            {
                int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
+                // docFreq still counts deleted documents, so it can exceed totalNumDocs; clamp it
+                if(totalNumDocs < docFreq) {
+                  docFreq = totalNumDocs;
+                }
                //IDF algorithm taken from DefaultSimilarity class
                float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
                terms[i].weight*=idf;
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@ -409,7 +409,10 @@ public class WeightedSpanTermExtractor {
      while (it.hasNext()) {
        WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
        int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
-
+        // docFreq still counts deleted documents, so it can exceed totalNumDocs; clamp it
+        if(totalNumDocs < docFreq) {
+          docFreq = totalNumDocs;
+        }
        // IDF algorithm taken from DefaultSimilarity class
        float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
        weightedSpanTerm.weight *= idf;
--- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -42,9 +42,14 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FilteredQuery;
@ -57,12 +62,14 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.RangeFilter;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanNotQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
@ -1246,6 +1253,60 @@ public class HighlighterTest extends TestCase implements Formatter {
    helper.start();
  }
  
+  // In-memory index that makeIndex(), deleteDocument() and searchIndex() all operate on.
+  private Directory dir = new RAMDirectory();
+  // Analyzer used for indexing, query parsing and highlighting in the deletes test.
+  private Analyzer a = new WhitespaceAnalyzer();
+  
+  /**
+   * Regression test for LUCENE-1355: builds a small index, deletes the documents that contain
+   * the term "del" without optimizing afterwards, and then highlights the search results with
+   * a QueryScorer that is given the index reader.
+   */
+  public void testWeightedTermsWithDeletes() throws IOException, ParseException {
+    makeIndex();
+    deleteDocument();
+    searchIndex();
+  }
+  
+  /**
+   * Builds a document holding a single stored, tokenized field.
+   *
+   * @param f name of the field
+   * @param v text value of the field
+   * @return a new document containing only that field
+   */
+  private Document doc( String f, String v ){
+    Field field = new Field( f, v, Store.YES, Index.TOKENIZED );
+    Document result = new Document();
+    result.add( field );
+    return result;
+  }
+  
+  /**
+   * Populates the index with four "t_text1" documents; the first three contain the term "del"
+   * and the last one does not. The index is optimized before the writer is closed.
+   *
+   * @throws IOException if writing to the index fails
+   */
+  private void makeIndex() throws IOException {
+    final String[] contents = {
+      "random words for highlighting tests del",
+      "more random words for second field del",
+      "random words for highlighting tests del",
+      "more random words for second field"
+    };
+    IndexWriter writer = new IndexWriter( dir, a, MaxFieldLength.LIMITED );
+    for( int i = 0; i < contents.length; i++ ){
+      writer.addDocument( doc( "t_text1", contents[i] ) );
+    }
+    writer.optimize();
+    writer.close();
+  }
+  
+  /**
+   * Deletes every document whose "t_text1" field contains the term "del".
+   *
+   * @throws IOException if the index cannot be modified
+   */
+  private void deleteDocument() throws IOException {
+    Term deleteMarker = new Term( "t_text1", "del" );
+    IndexWriter writer = new IndexWriter( dir, a, false, MaxFieldLength.LIMITED );
+    writer.deleteDocuments( deleteMarker );
+    // Deliberately no writer.optimize() here: the negative idf only shows up while the
+    // index is left un-optimized after the delete.
+    writer.close();
+  }
+  
+  /**
+   * Searches the index for "t_text1:random" and highlights every returned hit with a
+   * QueryScorer that is given the index reader. That scorer variant could produce a negative
+   * idf, which in turn made getBestFragment return a null fragment.
+   *
+   * @throws IOException if the index cannot be read
+   * @throws ParseException if the query string cannot be parsed
+   */
+  private void searchIndex() throws IOException, ParseException {
+    String q = "t_text1:random";
+    QueryParser parser = new QueryParser( "t_text1", a );
+    Query query = parser.parse( q );
+    IndexSearcher searcher = new IndexSearcher( dir );
+    try {
+      // Use the reader-backed scorer on purpose: QueryScorer( query, "t_text1" ) does not
+      // use idf and therefore cannot reproduce the problem.
+      Scorer scorer = new QueryScorer( query, searcher.getIndexReader(), "t_text1" );
+      Highlighter h = new Highlighter( scorer );
+
+      TopDocs hits = searcher.search( query, null, 10 );
+      // Without this check the loop below would pass vacuously when nothing matches.
+      assertTrue( "expected at least one hit for " + q, hits.totalHits > 0 );
+      // Iterate over scoreDocs, which holds at most the 10 requested entries; totalHits
+      // may be larger than that.
+      for( int i = 0; i < hits.scoreDocs.length; i++ ){
+        Document doc = searcher.doc( hits.scoreDocs[i].doc );
+        String result = h.getBestFragment( a, "t_text1", doc.get( "t_text1" ) );
+        assertEquals( "more <B>random</B> words for second field", result );
+      }
+    } finally {
+      // Release the searcher even when an assertion above fails.
+      searcher.close();
+    }
+  }
+
  /*
   * 
   * public void testBigramAnalyzer() throws IOException, ParseException {