LUCENE-3029: MultiPhraseQuery scores should not depend on docID

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-02 20:39:26 +00:00
parent abc9be2eef
commit fd0701bf4e
5 changed files with 119 additions and 6 deletions

View File

@ -1477,6 +1477,10 @@ Bug fixes
that warming is free to do whatever it needs to. (Earwin Burrfoot that warming is free to do whatever it needs to. (Earwin Burrfoot
via Mike McCandless) via Mike McCandless)
* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
position-increment tokens that would sometimes assign different
scores to identical docs. (Mike McCandless)
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
files when a mergedSegmentWarmer is set on IndexWriter. (Mike files when a mergedSegmentWarmer is set on IndexWriter. (Mike
McCandless) McCandless)

View File

@ -28,13 +28,15 @@ final class PhrasePositions {
int position; // position in doc int position; // position in doc
int count; // remaining pos in this doc int count; // remaining pos in this doc
int offset; // position in phrase int offset; // position in phrase
final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists PhrasePositions next; // used to make lists
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
PhrasePositions(DocsAndPositionsEnum postings, int o) { PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
this.postings = postings; this.postings = postings;
offset = o; offset = o;
this.ord = ord;
} }
final boolean next() throws IOException { // increments to next doc final boolean next() throws IOException { // increments to next doc

View File

@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
if (pp1.position == pp2.position) if (pp1.position == pp2.position)
// same doc and pp.position, so decide by actual term positions. // same doc and pp.position, so decide by actual term positions.
// rely on: pp.position == tp.position - offset. // rely on: pp.position == tp.position - offset.
return pp1.offset < pp2.offset; if (pp1.offset == pp2.offset) {
else return pp1.ord < pp2.ord;
} else {
return pp1.offset < pp2.offset;
}
else {
return pp1.position < pp2.position; return pp1.position < pp2.position;
else }
else {
return pp1.doc < pp2.doc; return pp1.doc < pp2.doc;
}
} }
} }

View File

@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
// this allows to easily identify a matching (exact) phrase // this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position. // when all PhrasePositions have exactly the same position.
for (int i = 0; i < postings.length; i++) { for (int i = 0; i < postings.length; i++) {
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position); PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
if (last != null) { // add next to end of list if (last != null) { // add next to end of list
last.next = pp; last.next = pp;
} else { } else {

View File

@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.LinkedList; import java.util.LinkedList;
import java.io.Reader;
/** /**
* This class tests the MultiPhraseQuery class. * This class tests the MultiPhraseQuery class.
@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
reader.close(); reader.close();
indexStore.close(); indexStore.close();
} }
/**
 * Immutable pairing of a token's text with an integer position.
 * CannedTokenizer derives each position increment from the difference
 * between the positions of successive entries.
 */
private static class TokenAndPos {
  public final String token;
  public final int pos;

  public TokenAndPos(String tokenText, int position) {
    token = tokenText;
    pos = position;
  }
}
/**
 * Analyzer that disregards the field name and the reader it is given and
 * instead hands back a CannedTokenizer over a fixed array of tokens.
 */
private static class CannedAnalyzer extends Analyzer {
  private final TokenAndPos[] tokens;

  public CannedAnalyzer(TokenAndPos[] cannedTokens) {
    tokens = cannedTokens;
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // A new tokenizer is created on every call, so each one starts from
    // the first canned token.
    final TokenStream stream = new CannedTokenizer(tokens);
    return stream;
  }
}
/**
 * Tokenizer that replays a fixed array of tokens, one per call to
 * incrementToken(), setting each token's position increment to the
 * difference between its position and the previous token's position.
 */
private static class CannedTokenizer extends Tokenizer {
  private final TokenAndPos[] tokens;
  // Index of the next entry of tokens to emit.
  private int upto = 0;
  // Position of the most recently emitted token (0 before the first one).
  private int lastPos = 0;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  public CannedTokenizer(TokenAndPos[] cannedTokens) {
    tokens = cannedTokens;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    // Attributes are cleared on every call, including the one that
    // reports exhaustion.
    clearAttributes();
    if (upto >= tokens.length) {
      return false;
    }

    final TokenAndPos current = tokens[upto];
    upto++;

    termAtt.setEmpty().append(current.token);

    // Convert the absolute position into an increment relative to the
    // previously emitted token.
    final int increment = current.pos - lastPos;
    posIncrAtt.setPositionIncrement(increment);
    lastPos = current.pos;
    return true;
  }
}
/**
 * LUCENE-3029: indexes the same document twice, where the tokens "a",
 * "b" and "c" are all given position 0 by the canned analyzer, and
 * checks that a MultiPhraseQuery over those terms matches both documents
 * and gives them the same score.
 */
public void testZeroPosIncr() throws IOException {
  Directory dir = new RAMDirectory();

  // Three tokens stacked on the same position.
  final TokenAndPos[] tokens = new TokenAndPos[3];
  tokens[0] = new TokenAndPos("a", 0);
  tokens[1] = new TokenAndPos("b", 0);
  tokens[2] = new TokenAndPos("c", 0);

  RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
  Document doc = new Document();
  // The field value is irrelevant: CannedAnalyzer ignores its input.
  doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(doc);
  writer.addDocument(doc);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher s = new IndexSearcher(r);

  MultiPhraseQuery mpq = new MultiPhraseQuery();
  // NOTE: not great that if we add these in the opposite order (the
  // single term "a" first, then {"b", "c"}) we get different scores!
  // MultiPhraseQuery counts that phrase as occurring twice per doc (it
  // should be 1, I think?). This is because MultipleTermPositions is
  // able to return the same position more than once (0, in this case).
  mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
  mpq.add(new Term[] {new Term("field", "a")}, 0);

  TopDocs hits = s.search(mpq, 2);
  // Use a JUnit assertion so the check still runs when the JVM is
  // started without -ea.
  assertEquals(2, hits.totalHits);
  assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);

  r.close();
  dir.close();
}
} }