remove score normalization from expert level search: LUCENE-469

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@354819 13f79535-47bb-0310-9956-ffa450edef68
2005-12-07 17:48:37 +00:00 · 2005-12-07 17:48:37 +00:00 · ad9e9bceb4
parent 2f98d903ce
commit ad9e9bceb4
9 changed files with 164 additions and 17 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -66,6 +66,11 @@ Changes in runtime behavior
    instead of using Integer and Float classes for parsing.
    (Yonik Seeley via Otis Gospodnetic)

+ 9. Expert level search routines returning TopDocs and TopFieldDocs
+    no longer normalize scores.  This also fixes bugs related to
+    MultiSearchers and score sorting/normalization.
+    (Luc Vanlerberghe via Yonik Seeley, LUCENE-469)
+
 New features

 1. Added support for stored compressed fields (patch #31149)
--- a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
+++ b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
@ -132,7 +132,7 @@ extends PriorityQueue {
    for (int i=0; i<n; ++i)
      fields[i] = comparators[i].sortValue(doc);
    doc.fields = fields;
-    if (maxscore > 1.0f) doc.score /= maxscore;   // normalize scores
+    //if (maxscore > 1.0f) doc.score /= maxscore;   // normalize scores
    return doc;
  }

--- a/src/java/org/apache/lucene/search/Hits.java
+++ b/src/java/org/apache/lucene/search/Hits.java
@ -67,8 +67,9 @@ public final class Hits {
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;

    float scoreNorm = 1.0f;
-    if (length > 0 && scoreDocs[0].score > 1.0f) {
-      scoreNorm = 1.0f / scoreDocs[0].score;
+    
+    if (length > 0 && topDocs.getMaxScore() > 1.0f) {
+      scoreNorm = 1.0f / topDocs.getMaxScore();
    }

    int end = scoreDocs.length < length ? scoreDocs.length : length;
--- a/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/src/java/org/apache/lucene/search/IndexSearcher.java
@ -97,7 +97,7 @@ public class IndexSearcher extends Searcher {

    Scorer scorer = weight.scorer(reader);
    if (scorer == null)
-      return new TopDocs(0, new ScoreDoc[0]);
+      return new TopDocs(0, new ScoreDoc[0], Float.NEGATIVE_INFINITY);

    final BitSet bits = filter != null ? filter.bits(reader) : null;
    final HitQueue hq = new HitQueue(nDocs);
@ -120,7 +120,9 @@ public class IndexSearcher extends Searcher {
    for (int i = hq.size()-1; i >= 0; i--)        // put docs in array
      scoreDocs[i] = (ScoreDoc)hq.pop();

-    return new TopDocs(totalHits[0], scoreDocs);
+    float maxScore = (totalHits[0]==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score;
+    
+    return new TopDocs(totalHits[0], scoreDocs, maxScore);
  }

  // inherit javadoc
@ -129,7 +131,7 @@ public class IndexSearcher extends Searcher {
      throws IOException {
    Scorer scorer = weight.scorer(reader);
    if (scorer == null)
-      return new TopFieldDocs(0, new ScoreDoc[0], sort.fields);
+      return new TopFieldDocs(0, new ScoreDoc[0], sort.fields, Float.NEGATIVE_INFINITY);

    final BitSet bits = filter != null ? filter.bits(reader) : null;
    final FieldSortedHitQueue hq =
@ -149,7 +151,7 @@ public class IndexSearcher extends Searcher {
    for (int i = hq.size()-1; i >= 0; i--)        // put docs in array
      scoreDocs[i] = hq.fillFields ((FieldDoc) hq.pop());

-    return new TopFieldDocs(totalHits[0], scoreDocs, hq.getFields());
+    return new TopFieldDocs(totalHits[0], scoreDocs, hq.getFields(), hq.getMaxScore());
  }

  // inherit javadoc
--- a/src/java/org/apache/lucene/search/MultiSearcher.java
+++ b/src/java/org/apache/lucene/search/MultiSearcher.java
@ -208,8 +208,10 @@ public class MultiSearcher extends Searcher {
    ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
    for (int i = hq.size()-1; i >= 0; i--)	  // put docs in array
      scoreDocs[i] = (ScoreDoc)hq.pop();
-
-    return new TopDocs(totalHits, scoreDocs);
+    
+    float maxScore = (totalHits==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score;
+    
+    return new TopDocs(totalHits, scoreDocs, maxScore);
  }

  public TopFieldDocs search (Weight weight, Filter filter, int n, Sort sort)
@ -217,10 +219,14 @@ public class MultiSearcher extends Searcher {
    FieldDocSortedHitQueue hq = null;
    int totalHits = 0;

+    float maxScore=Float.NEGATIVE_INFINITY;
+    
    for (int i = 0; i < searchables.length; i++) { // search each searcher
      TopFieldDocs docs = searchables[i].search (weight, filter, n, sort);
+      
      if (hq == null) hq = new FieldDocSortedHitQueue (docs.fields, n);
      totalHits += docs.totalHits;		  // update totalHits
+      maxScore = Math.max(maxScore, docs.getMaxScore());
      ScoreDoc[] scoreDocs = docs.scoreDocs;
      for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq
        ScoreDoc scoreDoc = scoreDocs[j];
@ -234,7 +240,7 @@ public class MultiSearcher extends Searcher {
    for (int i = hq.size() - 1; i >= 0; i--)	  // put docs in array
      scoreDocs[i] = (ScoreDoc) hq.pop();

-    return new TopFieldDocs (totalHits, scoreDocs, hq.getFields());
+    return new TopFieldDocs (totalHits, scoreDocs, hq.getFields(), maxScore);
  }


--- a/src/java/org/apache/lucene/search/ParallelMultiSearcher.java
+++ b/src/java/org/apache/lucene/search/ParallelMultiSearcher.java
@ -90,7 +90,9 @@ public class ParallelMultiSearcher extends MultiSearcher {
    for (int i = hq.size() - 1; i >= 0; i--) // put docs in array
      scoreDocs[i] = (ScoreDoc) hq.pop();

-    return new TopDocs(totalHits, scoreDocs);
+    float maxScore = (totalHits==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score;
+    
+    return new TopDocs(totalHits, scoreDocs, maxScore);
  }

  /**
@ -120,6 +122,8 @@ public class ParallelMultiSearcher extends MultiSearcher {
      msta[i].start();
    }

+    float maxScore=Float.NEGATIVE_INFINITY;
+    
    for (int i = 0; i < searchables.length; i++) {
      try {
        msta[i].join();
@ -129,6 +133,7 @@ public class ParallelMultiSearcher extends MultiSearcher {
      IOException ioe = msta[i].getIOException();
      if (ioe == null) {
        totalHits += msta[i].hits();
+        maxScore=Math.max(maxScore, msta[i].getMaxScore());
      } else {
        // if one search produced an IOException, rethrow it
        throw ioe;
@ -139,7 +144,7 @@ public class ParallelMultiSearcher extends MultiSearcher {
    for (int i = hq.size() - 1; i >= 0; i--) // put docs in array
      scoreDocs[i] = (ScoreDoc) hq.pop();

-    return new TopFieldDocs(totalHits, scoreDocs, hq.getFields());
+    return new TopFieldDocs(totalHits, scoreDocs, hq.getFields(), maxScore);
  }

  /** Lower-level search API.
@ -274,6 +279,10 @@ class MultiSearcherThread extends Thread {
    return docs.totalHits;
  }

+  public float getMaxScore() {
+      return docs.getMaxScore();
+  }
+  
  public IOException getIOException() {
    return ioe;
  }
--- a/src/java/org/apache/lucene/search/TopDocs.java
+++ b/src/java/org/apache/lucene/search/TopDocs.java
@ -25,10 +25,23 @@ public class TopDocs implements java.io.Serializable {
  public int totalHits;
  /** Expert: The top hits for the query. */
  public ScoreDoc[] scoreDocs;
-
+  /** Expert: Stores the maximum score value encountered, needed for normalizing. */
+  private float maxScore;
+  
+  /** Expert: Returns the maximum score value encountered. */
+  public float getMaxScore() {
+      return maxScore;
+  }
+  
+  /** Expert: Sets the maximum score value encountered. */
+  public void setMaxScore(float maxScore) {
+      this.maxScore=maxScore;
+  }
+  
  /** Expert: Constructs a TopDocs.*/
-  TopDocs(int totalHits, ScoreDoc[] scoreDocs) {
+  TopDocs(int totalHits, ScoreDoc[] scoreDocs, float maxScore) {
    this.totalHits = totalHits;
    this.scoreDocs = scoreDocs;
+    this.maxScore = maxScore;
  }
 }
--- a/src/java/org/apache/lucene/search/TopFieldDocs.java
+++ b/src/java/org/apache/lucene/search/TopFieldDocs.java
@ -32,14 +32,15 @@ extends TopDocs {

 	/** The fields which were used to sort results by. */
 	public SortField[] fields;
-
+        
 	/** Creates one of these objects.
 	 * @param totalHits  Total number of hits for the query.
 	 * @param scoreDocs  The top hits for the query.
 	 * @param fields     The sort criteria used to find the top hits.
+	 * @param maxScore   The maximum score encountered.
 	 */
-	TopFieldDocs (int totalHits, ScoreDoc[] scoreDocs, SortField[] fields) {
-	  super (totalHits, scoreDocs);
+	TopFieldDocs (int totalHits, ScoreDoc[] scoreDocs, SortField[] fields, float maxScore) {
+	  super (totalHits, scoreDocs, maxScore);
 	  this.fields = fields;
 	}
 }
--- a/src/test/org/apache/lucene/search/TestMultiSearcher.java
+++ b/src/test/org/apache/lucene/search/TestMultiSearcher.java
@ -17,6 +17,7 @@ package org.apache.lucene.search;
 */

 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@ -181,4 +182,113 @@ public class TestMultiSearcher extends TestCase
        }
        mSearcher3.close();
    }
+    
+    private static Document createDocument(String contents1, String contents2) {
+        Document document=new Document();
+        
+        document.add(new Field("contents", contents1, Field.Store.YES, Field.Index.UN_TOKENIZED));
+        
+        if (contents2!=null) {
+            document.add(new Field("contents", contents2, Field.Store.YES, Field.Index.UN_TOKENIZED));
+        }
+        
+        return document;
+    }
+    
+    private static void initIndex(Directory directory, int nDocs, boolean create, String contents2) throws IOException {
+        IndexWriter indexWriter=null;
+        
+        try {
+            indexWriter=new IndexWriter(directory, new KeywordAnalyzer(), create);
+            
+            for (int i=0; i<nDocs; i++) {
+                indexWriter.addDocument(createDocument("doc" + i, contents2));
+            }
+        } finally {
+            if (indexWriter!=null) {
+                indexWriter.close();
+            }
+        }
+    }
+    
+    /* uncomment this when the highest score is always normalized to 1.0, even when it was < 1.0
+    public void testNormalization1() throws IOException {
+        testNormalization(1, "Using 1 document per index:");
+    }
+     */
+    
+    public void testNormalization10() throws IOException {
+        testNormalization(10, "Using 10 documents per index:");
+    }
+    
+    private void testNormalization(int nDocs, String message) throws IOException {
+        Query query=new TermQuery(new Term("contents", "doc0"));
+        
+        RAMDirectory ramDirectory1;
+        IndexSearcher indexSearcher1;
+        Hits hits;
+        
+        ramDirectory1=new RAMDirectory();
+        
+        // First put the documents in the same index
+        initIndex(ramDirectory1, nDocs, true, null); // documents with a single token "doc0", "doc1", etc...
+        initIndex(ramDirectory1, nDocs, false, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
+        
+        indexSearcher1=new IndexSearcher(ramDirectory1);
+        
+        hits=indexSearcher1.search(query);
+        
+        assertEquals(message, 2, hits.length());
+        
+        assertEquals(message, 1, hits.score(0), 1e-6); // hits.score(0) is 0.594535 if only a single document is in first index
+        
+        // Store the scores for use later
+        float[] scores={ hits.score(0), hits.score(1) };
+        
+        assertTrue(message, scores[0] > scores[1]);
+        
+        indexSearcher1.close();
+        ramDirectory1.close();
+        hits=null;
+        
+        
+        
+        RAMDirectory ramDirectory2;
+        IndexSearcher indexSearcher2;
+        
+        ramDirectory1=new RAMDirectory();
+        ramDirectory2=new RAMDirectory();
+        
+        // Now put the documents in a different index
+        initIndex(ramDirectory1, nDocs, true, null); // documents with a single token "doc0", "doc1", etc...
+        initIndex(ramDirectory2, nDocs, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
+        
+        indexSearcher1=new IndexSearcher(ramDirectory1);
+        indexSearcher2=new IndexSearcher(ramDirectory2);
+        
+        Searcher searcher=getMultiSearcherInstance(new Searcher[] { indexSearcher1, indexSearcher2 });
+        
+        hits=searcher.search(query);
+        
+        assertEquals(message, 2, hits.length());
+        
+        // The scores should be the same (within reason)
+        assertEquals(message, scores[0], hits.score(0), 1e-6); // This will a document from ramDirectory1
+        assertEquals(message, scores[1], hits.score(1), 1e-6); // This will a document from ramDirectory2
+        
+        
+        
+        // Adding a Sort.RELEVANCE object should not change anything
+        hits=searcher.search(query, Sort.RELEVANCE);
+        
+        assertEquals(message, 2, hits.length());
+        
+        assertEquals(message, scores[0], hits.score(0), 1e-6); // This will a document from ramDirectory1
+        assertEquals(message, scores[1], hits.score(1), 1e-6); // This will a document from ramDirectory2
+        
+        searcher.close();
+        
+        ramDirectory1.close();
+        ramDirectory2.close();
+    }
 }