LUCENE-3029: MultiPhraseQuery scores should not depend on docID

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-02 20:39:26 +00:00
parent abc9be2eef
commit fd0701bf4e
5 changed files with 119 additions and 6 deletions

View File

@ -1477,6 +1477,10 @@ Bug fixes
that warming is free to do whatever it needs to. (Earwin Burrfoot
via Mike McCandless)
* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
position-increment tokens that would sometimes assign different
scores to identical docs. (Mike McCandless)
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
files when a mergedSegmentWarmer is set on IndexWriter. (Mike
McCandless)

View File

@ -28,13 +28,15 @@ final class PhrasePositions {
int position; // position in doc
int count; // remaining pos in this doc
int offset; // position in phrase
final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
PhrasePositions(DocsAndPositionsEnum postings, int o) {
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
this.postings = postings;
offset = o;
this.ord = ord;
}
final boolean next() throws IOException { // increments to next doc

View File

@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
if (pp1.position == pp2.position)
// same doc and pp.position, so decide by actual term positions.
// rely on: pp.position == tp.position - offset.
return pp1.offset < pp2.offset;
else
if (pp1.offset == pp2.offset) {
return pp1.ord < pp2.ord;
} else {
return pp1.offset < pp2.offset;
}
else {
return pp1.position < pp2.position;
else
}
else {
return pp1.doc < pp2.doc;
}
}
}

View File

@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
for (int i = 0; i < postings.length; i++) {
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
if (last != null) { // add next to end of list
last.next = pp;
} else {

View File

@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.io.Reader;
/**
* This class tests the MultiPhraseQuery class.
@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
reader.close();
indexStore.close();
}
/**
 * Immutable pairing of a term's text with the position it should occupy.
 * The position is absolute: the tokenizer in this test derives the
 * position increment by subtracting the previous token's position.
 */
private static class TokenAndPos {
  /** Text of the term. */
  public final String token;

  /** Absolute position of the term. */
  public final int pos;

  public TokenAndPos(String token, int pos) {
    this.pos = pos;
    this.token = token;
  }
}
/**
 * Analyzer that disregards the field name and reader handed to it and
 * always answers with a new {@link CannedTokenizer} over the token array
 * supplied at construction time.
 */
private static class CannedAnalyzer extends Analyzer {
  private final TokenAndPos[] tokens;

  public CannedAnalyzer(TokenAndPos[] tokens) {
    this.tokens = tokens;
  }

  /** Builds a fresh tokenizer on every call; both arguments are unused. */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    final TokenStream canned = new CannedTokenizer(tokens);
    return canned;
  }
}
/**
 * Tokenizer that replays a fixed array of tokens instead of reading any
 * input. Each emitted token's position increment is the difference between
 * its position and the position of the previously emitted token (starting
 * from 0), so several tokens can be stacked on the same position.
 */
private static class CannedTokenizer extends Tokenizer {
  private final TokenAndPos[] tokens;
  // index of the next token to emit
  private int upto = 0;
  // position of the most recently emitted token
  private int lastPos = 0;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  public CannedTokenizer(TokenAndPos[] tokens) {
    this.tokens = tokens;
  }

  /**
   * Emits the next canned token.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         array is exhausted
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    if (upto >= tokens.length) {
      return false;
    }
    final TokenAndPos token = tokens[upto++];
    termAtt.setEmpty();
    termAtt.append(token.token);
    posIncrAtt.setPositionIncrement(token.pos - lastPos);
    lastPos = token.pos;
    return true;
  }

  /**
   * Rewinds the replay cursor so that a reused instance emits the same
   * token sequence again; without this the stream would stay exhausted
   * after its first pass.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    upto = 0;
    lastPos = 0;
  }
}
/**
 * LUCENE-3029: indexes the same document twice, with the terms "a", "b"
 * and "c" all stacked on position 0, and checks that a MultiPhraseQuery
 * over those terms matches both documents with equal scores.
 */
public void testZeroPosIncr() throws IOException {
  Directory dir = new RAMDirectory();
  final TokenAndPos[] tokens = new TokenAndPos[3];
  tokens[0] = new TokenAndPos("a", 0);
  tokens[1] = new TokenAndPos("b", 0);
  tokens[2] = new TokenAndPos("c", 0);

  RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
  Document doc = new Document();
  // The field value is irrelevant: CannedAnalyzer ignores its input.
  doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(doc);
  writer.addDocument(doc);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher s = new IndexSearcher(r);

  MultiPhraseQuery mpq = new MultiPhraseQuery();
  // NOTE: not great that if the single-term clause ("a") is added before
  // the two-term clause ("b", "c") we get different scores!
  // MultiPhraseQuery then counts the phrase as occurring twice per doc
  // (it should be 1, I think?). This is because MultipleTermPositions is
  // able to return the same position more than once (0, in this case).
  mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
  mpq.add(new Term[] {new Term("field", "a")}, 0);

  TopDocs hits = s.search(mpq, 2);
  // Use the test framework's assertion rather than the assert keyword so
  // the check always runs and reports expected vs. actual on failure.
  assertEquals(2, hits.totalHits);
  assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);

  r.close();
  dir.close();
}
}