mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 21:39:25 +00:00
LUCENE-3029: MultiPhraseQuery scores should not depend on docID
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
abc9be2eef
commit
fd0701bf4e
@ -1477,6 +1477,10 @@ Bug fixes
|
|||||||
that warming is free to do whatever it needs to. (Earwin Burrfoot
|
that warming is free to do whatever it needs to. (Earwin Burrfoot
|
||||||
via Mike McCandless)
|
via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
|
||||||
|
position-increment tokens that would sometimes assign different
|
||||||
|
scores to identical docs. (Mike McCandless)
|
||||||
|
|
||||||
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
|
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
|
||||||
files when a mergedSegmentWarmer is set on IndexWriter. (Mike
|
files when a mergedSegmentWarmer is set on IndexWriter. (Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
@ -28,13 +28,15 @@ final class PhrasePositions {
|
|||||||
int position; // position in doc
|
int position; // position in doc
|
||||||
int count; // remaining pos in this doc
|
int count; // remaining pos in this doc
|
||||||
int offset; // position in phrase
|
int offset; // position in phrase
|
||||||
|
final int ord; // unique across all PhrasePositions instances
|
||||||
final DocsAndPositionsEnum postings; // stream of docs & positions
|
final DocsAndPositionsEnum postings; // stream of docs & positions
|
||||||
PhrasePositions next; // used to make lists
|
PhrasePositions next; // used to make lists
|
||||||
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
|
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
|
||||||
|
|
||||||
PhrasePositions(DocsAndPositionsEnum postings, int o) {
|
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
|
||||||
this.postings = postings;
|
this.postings = postings;
|
||||||
offset = o;
|
offset = o;
|
||||||
|
this.ord = ord;
|
||||||
}
|
}
|
||||||
|
|
||||||
final boolean next() throws IOException { // increments to next doc
|
final boolean next() throws IOException { // increments to next doc
|
||||||
|
@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
|
|||||||
if (pp1.position == pp2.position)
|
if (pp1.position == pp2.position)
|
||||||
// same doc and pp.position, so decide by actual term positions.
|
// same doc and pp.position, so decide by actual term positions.
|
||||||
// rely on: pp.position == tp.position - offset.
|
// rely on: pp.position == tp.position - offset.
|
||||||
return pp1.offset < pp2.offset;
|
if (pp1.offset == pp2.offset) {
|
||||||
else
|
return pp1.ord < pp2.ord;
|
||||||
|
} else {
|
||||||
|
return pp1.offset < pp2.offset;
|
||||||
|
}
|
||||||
|
else {
|
||||||
return pp1.position < pp2.position;
|
return pp1.position < pp2.position;
|
||||||
else
|
}
|
||||||
|
else {
|
||||||
return pp1.doc < pp2.doc;
|
return pp1.doc < pp2.doc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
|
|||||||
// this allows to easily identify a matching (exact) phrase
|
// this allows to easily identify a matching (exact) phrase
|
||||||
// when all PhrasePositions have exactly the same position.
|
// when all PhrasePositions have exactly the same position.
|
||||||
for (int i = 0; i < postings.length; i++) {
|
for (int i = 0; i < postings.length; i++) {
|
||||||
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
|
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
|
||||||
if (last != null) { // add next to end of list
|
if (last != null) { // add next to end of list
|
||||||
last.next = pp;
|
last.next = pp;
|
||||||
} else {
|
} else {
|
||||||
|
@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFields;
|
|||||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class tests the MultiPhraseQuery class.
|
* This class tests the MultiPhraseQuery class.
|
||||||
@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
|
|||||||
reader.close();
|
reader.close();
|
||||||
indexStore.close();
|
indexStore.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class TokenAndPos {
|
||||||
|
public final String token;
|
||||||
|
public final int pos;
|
||||||
|
public TokenAndPos(String token, int pos) {
|
||||||
|
this.token = token;
|
||||||
|
this.pos = pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class CannedAnalyzer extends Analyzer {
|
||||||
|
private final TokenAndPos[] tokens;
|
||||||
|
|
||||||
|
public CannedAnalyzer(TokenAndPos[] tokens) {
|
||||||
|
this.tokens = tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
return new CannedTokenizer(tokens);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class CannedTokenizer extends Tokenizer {
|
||||||
|
private final TokenAndPos[] tokens;
|
||||||
|
private int upto = 0;
|
||||||
|
private int lastPos = 0;
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
public CannedTokenizer(TokenAndPos[] tokens) {
|
||||||
|
this.tokens = tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws IOException {
|
||||||
|
clearAttributes();
|
||||||
|
if (upto < tokens.length) {
|
||||||
|
final TokenAndPos token = tokens[upto++];
|
||||||
|
termAtt.setEmpty();
|
||||||
|
termAtt.append(token.token);
|
||||||
|
posIncrAtt.setPositionIncrement(token.pos - lastPos);
|
||||||
|
lastPos = token.pos;
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testZeroPosIncr() throws IOException {
|
||||||
|
Directory dir = new RAMDirectory();
|
||||||
|
final TokenAndPos[] tokens = new TokenAndPos[3];
|
||||||
|
tokens[0] = new TokenAndPos("a", 0);
|
||||||
|
tokens[1] = new TokenAndPos("b", 0);
|
||||||
|
tokens[2] = new TokenAndPos("c", 0);
|
||||||
|
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader r = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher s = new IndexSearcher(r);
|
||||||
|
MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||||
|
//mpq.setSlop(1);
|
||||||
|
|
||||||
|
// NOTE: not great that if we do the else clause here we
|
||||||
|
// get different scores! MultiPhraseQuery counts that
|
||||||
|
// phrase as occurring twice per doc (it should be 1, I
|
||||||
|
// think?). This is because MultipleTermPositions is able to
|
||||||
|
// return the same position more than once (0, in this
|
||||||
|
// case):
|
||||||
|
if (true) {
|
||||||
|
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
|
||||||
|
mpq.add(new Term[] {new Term("field", "a")}, 0);
|
||||||
|
} else {
|
||||||
|
mpq.add(new Term[] {new Term("field", "a")}, 0);
|
||||||
|
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
|
||||||
|
}
|
||||||
|
TopDocs hits = s.search(mpq, 2);
|
||||||
|
assert hits.totalHits == 2;
|
||||||
|
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
|
||||||
|
/*
|
||||||
|
for(int hit=0;hit<hits.totalHits;hit++) {
|
||||||
|
ScoreDoc sd = hits.scoreDocs[hit];
|
||||||
|
System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user