LUCENE-3029: MultiPhraseQuery scores should not depend on docID

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
2025-02-28 21:39:25 +00:00 · 2011-05-02 20:39:26 +00:00 · 2011-05-02 20:39:26 +00:00 · fd0701bf4e
commit fd0701bf4e
parent abc9be2eef
5 changed files with 119 additions and 6 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -1477,6 +1477,10 @@ Bug fixes
  that warming is free to do whatever it needs to.  (Earwin Burrfoot
  via Mike McCandless)
 * LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
  position-increment tokens that would sometimes assign different
  scores to identical docs.  (Mike McCandless)
 * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
  files when a mergedSegmentWarmer is set on IndexWriter.  (Mike
  McCandless)
--- a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
+++ b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
@ -28,13 +28,15 @@ final class PhrasePositions {
  int position;					  // position in doc
  int count;					  // remaining pos in this doc
  int offset;					  // position in phrase
  final int ord;                                  // unique across all PhrasePositions instances
  final DocsAndPositionsEnum postings;  	  // stream of docs & positions
  PhrasePositions next;	                          // used to make lists
  boolean repeats;       // there's other pp for same term (e.g. query="1st word 2nd word"~1) 
-  PhrasePositions(DocsAndPositionsEnum postings, int o) {
+  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
    this.postings = postings;
    offset = o;
    this.ord = ord;
  }
  final boolean next() throws IOException {	  // increments to next doc
--- a/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
      if (pp1.position == pp2.position)
        // same doc and pp.position, so decide by actual term positions. 
        // rely on: pp.position == tp.position - offset. 
-        return pp1.offset < pp2.offset;
+        if (pp1.offset == pp2.offset) {
-      else
+          return pp1.ord < pp2.ord;
        } else {
          return pp1.offset < pp2.offset;
        }
      else {
        return pp1.position < pp2.position;
-    else
+      }
    else {
      return pp1.doc < pp2.doc;
    }
  }
 }
--- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
    // this allows to easily identify a matching (exact) phrase 
    // when all PhrasePositions have exactly the same position.
    for (int i = 0; i < postings.length; i++) {
-      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
+      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
      if (last != null) {			  // add next to end of list
        last.next = pp;
      } else {
--- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.LinkedList;
 import java.io.Reader;
 /**
 * This class tests the MultiPhraseQuery class.
@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
    reader.close();
    indexStore.close();
  }
  /**
   * Immutable pairing of a term's text with the absolute position at which
   * that term should be emitted by the canned token stream.
   */
  private static class TokenAndPos {
    /** Text of the token. */
    public final String token;
    /** Absolute position of the token within the field. */
    public final int pos;

    /**
     * @param token text of the token
     * @param pos absolute position of the token
     */
    public TokenAndPos(final String token, final int pos) {
      this.pos = pos;
      this.token = token;
    }
  }
  /**
   * Analyzer that ignores both the field name and the supplied reader and
   * always produces a stream over the fixed token array given at construction.
   */
  private static class CannedAnalyzer extends Analyzer {
    /** Tokens handed to every tokenizer this analyzer creates. */
    private final TokenAndPos[] tokens;

    /**
     * @param tokens the tokens every produced stream will emit, in order
     */
    public CannedAnalyzer(TokenAndPos[] tokens) {
      this.tokens = tokens;
    }

    /**
     * Builds a new {@link CannedTokenizer} over the fixed tokens; neither
     * argument is consulted.
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      final TokenStream canned = new CannedTokenizer(tokens);
      return canned;
    }
  }
  /**
   * Tokenizer that replays a fixed array of {@link TokenAndPos} entries,
   * turning each entry's absolute position into a position increment relative
   * to the previously emitted entry.
   */
  private static class CannedTokenizer extends Tokenizer {
    private final TokenAndPos[] tokens;
    /** Index of the next entry of {@link #tokens} to emit. */
    private int upto = 0;
    /** Absolute position of the most recently emitted entry (0 before the first). */
    private int lastPos = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * @param tokens the entries to emit, in order
     */
    public CannedTokenizer(TokenAndPos[] tokens) {
      this.tokens = tokens;
    }

    /**
     * Emits the next canned entry, if any.
     *
     * @return {@code true} if an entry was emitted, {@code false} once all
     *         entries have been consumed
     */
    @Override
    public final boolean incrementToken() throws IOException {
      clearAttributes();      
      if (upto >= tokens.length) {
        // nothing left to replay
        return false;
      }

      final TokenAndPos current = tokens[upto];
      upto++;

      termAtt.setEmpty();
      termAtt.append(current.token);

      // increment is the distance from the previously emitted position
      final int increment = current.pos - lastPos;
      posIncrAtt.setPositionIncrement(increment);
      lastPos = current.pos;
      return true;
    }
  }
  /**
   * Indexes the same document twice, where the single field produces the
   * tokens "a", "b" and "c" all at position 0, and verifies that a
   * MultiPhraseQuery matching those tokens gives both documents the same
   * score (LUCENE-3029).
   */
  public void testZeroPosIncr() throws IOException {
    Directory dir = new RAMDirectory();
    final TokenAndPos[] tokens = new TokenAndPos[3];
    tokens[0] = new TokenAndPos("a", 0);
    tokens[1] = new TokenAndPos("b", 0);
    tokens[2] = new TokenAndPos("c", 0);

    RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
    Document doc = new Document();
    // The field value is irrelevant: CannedAnalyzer ignores its input.
    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.addDocument(doc);
    IndexReader r = writer.getReader();
    writer.close();
    IndexSearcher s = new IndexSearcher(r);
    MultiPhraseQuery mpq = new MultiPhraseQuery();
    //mpq.setSlop(1);

    // NOTE: not great that if we do the else clause here we
    // get different scores!  MultiPhraseQuery counts that
    // phrase as occurring twice per doc (it should be 1, I
    // think?).  This is because MultipleTermPositions is able to
    // return the same position more than once (0, in this
    // case):
    if (true) {
      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
      mpq.add(new Term[] {new Term("field", "a")}, 0);
    } else {
      mpq.add(new Term[] {new Term("field", "a")}, 0);
      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
    }
    TopDocs hits = s.search(mpq, 2);
    // Use a JUnit assertion rather than the `assert` keyword so the check
    // still runs when the JVM is started without -ea; otherwise a missing hit
    // would only surface as an array-index failure on the next line.
    assertEquals("both identical docs should match", 2, hits.totalHits);
    assertEquals("identical docs must receive identical scores",
                 hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
    /*
    for(int hit=0;hit<hits.totalHits;hit++) {
      ScoreDoc sd = hits.scoreDocs[hit];
      System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
    }
    */
    r.close();
    dir.close();
  }
 }