LUCENE-3029: MultiPhraseQuery scores should not depend on docID

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
2025-02-28 21:39:25 +00:00 · 2011-05-02 20:39:26 +00:00 · 2011-05-02 20:39:26 +00:00 · fd0701bf4e
commit fd0701bf4e
parent abc9be2eef
5 changed files with 119 additions and 6 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -1477,6 +1477,10 @@ Bug fixes
  that warming is free to do whatever it needs to.  (Earwin Burrfoot
  via Mike McCandless)

+* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
+  position-increment tokens that would sometimes assign different
+  scores to identical docs.  (Mike McCandless)
+
 * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
  files when a mergedSegmentWarmer is set on IndexWriter.  (Mike
  McCandless)
--- a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
+++ b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
@ -28,13 +28,15 @@ final class PhrasePositions {
  int position;					  // position in doc
  int count;					  // remaining pos in this doc
  int offset;					  // position in phrase
+  final int ord;                                  // unique across all PhrasePositions instances
  final DocsAndPositionsEnum postings;  	  // stream of docs & positions
  PhrasePositions next;	                          // used to make lists
  boolean repeats;       // there's other pp for same term (e.g. query="1st word 2nd word"~1) 

-  PhrasePositions(DocsAndPositionsEnum postings, int o) {
+  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
    this.postings = postings;
    offset = o;
+    this.ord = ord;
  }

  final boolean next() throws IOException {	  // increments to next doc
--- a/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
      if (pp1.position == pp2.position)
        // same doc and pp.position, so decide by actual term positions. 
        // rely on: pp.position == tp.position - offset. 
-        return pp1.offset < pp2.offset;
-      else
+        if (pp1.offset == pp2.offset) {
+          return pp1.ord < pp2.ord;
+        } else {
+          return pp1.offset < pp2.offset;
+        }
+      else {
        return pp1.position < pp2.position;
-    else
+      }
+    else {
      return pp1.doc < pp2.doc;
+    }
  }
 }
--- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
    // this allows to easily identify a matching (exact) phrase 
    // when all PhrasePositions have exactly the same position.
    for (int i = 0; i < postings.length; i++) {
-      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
+      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
      if (last != null) {			  // add next to end of list
        last.next = pp;
      } else {
--- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
+++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.LinkedList;
+import java.io.Reader;

 /**
 * This class tests the MultiPhraseQuery class.
@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
    reader.close();
    indexStore.close();
  }
+
+  private static class TokenAndPos {
+    public final String token;
+    public final int pos;
+    public TokenAndPos(String token, int pos) {
+      this.token = token;
+      this.pos = pos;
+    }
+  }
+
+  private static class CannedAnalyzer extends Analyzer {
+    private final TokenAndPos[] tokens;
+    
+    public CannedAnalyzer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new CannedTokenizer(tokens);
+    }
+  }
+
+  private static class CannedTokenizer extends Tokenizer {
+    private final TokenAndPos[] tokens;
+    private int upto = 0;
+    private int lastPos = 0;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public CannedTokenizer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();      
+      if (upto < tokens.length) {
+        final TokenAndPos token = tokens[upto++];
+        termAtt.setEmpty();
+        termAtt.append(token.token);
+        posIncrAtt.setPositionIncrement(token.pos - lastPos);
+        lastPos = token.pos;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public void testZeroPosIncr() throws IOException {
+    Directory dir = new RAMDirectory();
+    final TokenAndPos[] tokens = new TokenAndPos[3];
+    tokens[0] = new TokenAndPos("a", 0);
+    tokens[1] = new TokenAndPos("b", 0);
+    tokens[2] = new TokenAndPos("c", 0);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
+    Document doc = new Document();
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.addDocument(doc);
+    IndexReader r = writer.getReader();
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+    MultiPhraseQuery mpq = new MultiPhraseQuery();
+    //mpq.setSlop(1);
+
+    // NOTE: not great that if we do the else clause here we
+    // get different scores!  MultiPhraseQuery counts that
+    // phrase as occurring twice per doc (it should be 1, I
+    // think?).  This is because MultipleTermPositions is able to
+    // return the same position more than once (0, in this
+    // case):
+    if (true) {
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+    } else {
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+    }
+    TopDocs hits = s.search(mpq, 2);
+    assert hits.totalHits == 2;
+    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+    /*
+    for(int hit=0;hit<hits.totalHits;hit++) {
+      ScoreDoc sd = hits.scoreDocs[hit];
+      System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+    }
+    */
+    r.close();
+    dir.close();
+  }
 }