Mirror of https://github.com/apache/lucene.git, synced 2025-02-28 21:39:25 +00:00
LUCENE-3029: MultiPhraseQuery scores should not depend on docID
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent abc9be2eef
commit fd0701bf4e
@@ -1477,6 +1477,10 @@ Bug fixes
   that warming is free to do whatever it needs to. (Earwin Burrfoot
   via Mike McCandless)
 
+* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
+  position-increment tokens that would sometimes assign different
+  scores to identical docs. (Mike McCandless)
+
 * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
   files when a mergedSegmentWarmer is set on IndexWriter. (Mike
   McCandless)
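The entry above describes a query whose terms are all stacked at the same position, which is what a stream of zero position-increment tokens produces at indexing time. A minimal sketch of such a query against the 4.0-era API (the field name, terms, and searcher variable are illustrative, not taken from the commit; the new testZeroPosIncr at the bottom of this commit builds the same query against a real index):

  // Sketch only: assumes an IndexSearcher "s" over an index in which the
  // terms "a", "b" and "c" were all indexed at position 0 of "field".
  MultiPhraseQuery mpq = new MultiPhraseQuery();
  mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
  mpq.add(new Term[] {new Term("field", "a")}, 0);
  TopDocs hits = s.search(mpq, 10);
  // Before this fix, identical documents matching such a query could
  // receive different scores depending on their docIDs.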
@@ -28,13 +28,15 @@ final class PhrasePositions {
   int position;          // position in doc
   int count;             // remaining pos in this doc
   int offset;            // position in phrase
+  final int ord;         // unique across all PhrasePositions instances
   final DocsAndPositionsEnum postings;  // stream of docs & positions
   PhrasePositions next;  // used to make lists
   boolean repeats;       // there's other pp for same term (e.g. query="1st word 2nd word"~1)
 
-  PhrasePositions(DocsAndPositionsEnum postings, int o) {
+  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
     this.postings = postings;
     offset = o;
+    this.ord = ord;
   }
 
   final boolean next() throws IOException {  // increments to next doc
@@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
       if (pp1.position == pp2.position)
         // same doc and pp.position, so decide by actual term positions.
         // rely on: pp.position == tp.position - offset.
-        return pp1.offset < pp2.offset;
-      else
+        if (pp1.offset == pp2.offset) {
+          return pp1.ord < pp2.ord;
+        } else {
+          return pp1.offset < pp2.offset;
+        }
+      else {
         return pp1.position < pp2.position;
-    else
+      }
+    else {
       return pp1.doc < pp2.doc;
+    }
   }
 }
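Put together, lessThan after this hunk compares doc first, then position, then offset, and finally the new ord, so entries that are otherwise equal still get a stable, docID-independent ordering in the priority queue. A reconstructed view of the whole method (the signature and the outer doc comparison are inferred from the hunk's context lines, so treat this as a sketch rather than a verbatim copy of PhraseQueue.java):

  protected final boolean lessThan(PhrasePositions pp1, PhrasePositions pp2) {
    if (pp1.doc == pp2.doc)
      if (pp1.position == pp2.position)
        // same doc and pp.position, so decide by actual term positions.
        // rely on: pp.position == tp.position - offset.
        if (pp1.offset == pp2.offset) {
          return pp1.ord < pp2.ord;      // new tie-break on the per-instance ordinal
        } else {
          return pp1.offset < pp2.offset;
        }
      else {
        return pp1.position < pp2.position;
      }
    else {
      return pp1.doc < pp2.doc;
    }
  }

The ordinal itself is assigned in PhraseScorer (next hunk), which passes the loop index, so every PhrasePositions in a scorer gets a distinct ord.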
@@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
     // this allows to easily identify a matching (exact) phrase
     // when all PhrasePositions have exactly the same position.
     for (int i = 0; i < postings.length; i++) {
-      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
+      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
       if (last != null) { // add next to end of list
         last.next = pp;
       } else {
@@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.IOException;
 import java.util.Collection;
 import java.util.LinkedList;
+import java.io.Reader;
 
 /**
  * This class tests the MultiPhraseQuery class.
@@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
     reader.close();
     indexStore.close();
   }
+
+  private static class TokenAndPos {
+    public final String token;
+    public final int pos;
+    public TokenAndPos(String token, int pos) {
+      this.token = token;
+      this.pos = pos;
+    }
+  }
+
+  private static class CannedAnalyzer extends Analyzer {
+    private final TokenAndPos[] tokens;
+
+    public CannedAnalyzer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new CannedTokenizer(tokens);
+    }
+  }
+
+  private static class CannedTokenizer extends Tokenizer {
+    private final TokenAndPos[] tokens;
+    private int upto = 0;
+    private int lastPos = 0;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public CannedTokenizer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();
+      if (upto < tokens.length) {
+        final TokenAndPos token = tokens[upto++];
+        termAtt.setEmpty();
+        termAtt.append(token.token);
+        posIncrAtt.setPositionIncrement(token.pos - lastPos);
+        lastPos = token.pos;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public void testZeroPosIncr() throws IOException {
+    Directory dir = new RAMDirectory();
+    final TokenAndPos[] tokens = new TokenAndPos[3];
+    tokens[0] = new TokenAndPos("a", 0);
+    tokens[1] = new TokenAndPos("b", 0);
+    tokens[2] = new TokenAndPos("c", 0);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
+    Document doc = new Document();
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.addDocument(doc);
+    IndexReader r = writer.getReader();
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+    MultiPhraseQuery mpq = new MultiPhraseQuery();
+    //mpq.setSlop(1);
+
+    // NOTE: not great that if we do the else clause here we
+    // get different scores!  MultiPhraseQuery counts that
+    // phrase as occurring twice per doc (it should be 1, I
+    // think?).  This is because MultipleTermPositions is able to
+    // return the same position more than once (0, in this
+    // case):
+    if (true) {
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+    } else {
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+    }
+    TopDocs hits = s.search(mpq, 2);
+    assert hits.totalHits == 2;
+    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+    /*
+    for(int hit=0;hit<hits.totalHits;hit++) {
+      ScoreDoc sd = hits.scoreDocs[hit];
+      System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+    }
+    */
+    r.close();
+    dir.close();
+  }
 }