mirror of https://github.com/apache/lucene.git
LUCENE-3412: SloppyPhraseScorer was returning non-deterministic results for queries with many repeats.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166541 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4ea767545b
commit
fa93e73225
|
@ -590,6 +590,9 @@ Bug fixes
|
||||||
easily corrupt the index. (Mark Miller, Robert Muir, Mike
|
easily corrupt the index. (Mark Miller, Robert Muir, Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3412: SloppyPhraseScorer was returning non-deterministic results
|
||||||
|
for queries with many repeats (Doron Cohen)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-3290: Added FieldInvertState.numUniqueTerms
|
* LUCENE-3290: Added FieldInvertState.numUniqueTerms
|
||||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.LinkedHashSet;
|
||||||
|
|
||||||
final class SloppyPhraseScorer extends PhraseScorer {
|
final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
private int slop;
|
private int slop;
|
||||||
|
@ -70,7 +70,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
PhrasePositions pp2 = null;
|
PhrasePositions pp2 = null;
|
||||||
tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null;
|
tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null;
|
||||||
if (pp2!=null && pp2!=pp) {
|
if (pp2!=null && pp2!=pp) {
|
||||||
pp = flip(pp,pp2); // flip pp to pp2
|
pp = flip(pp,pp2); // flip pp to pp2
|
||||||
}
|
}
|
||||||
|
@ -118,7 +118,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
* position in doc. This case is detected by comparing just the (query) offsets,
|
* position in doc. This case is detected by comparing just the (query) offsets,
|
||||||
* and two such PPs are not considered "repeating".
|
* and two such PPs are not considered "repeating".
|
||||||
* <br>- Also mark each such pp by pp.repeats = true.
|
* <br>- Also mark each such pp by pp.repeats = true.
|
||||||
* <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
|
* <br>Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient.
|
||||||
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
|
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
|
||||||
* <br>- Example 1 - query with no repetitions: "ho my"~2
|
* <br>- Example 1 - query with no repetitions: "ho my"~2
|
||||||
* <br>- Example 2 - query with repetitions: "ho my my"~2
|
* <br>- Example 2 - query with repetitions: "ho my my"~2
|
||||||
|
@ -147,11 +147,11 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
for (PhrasePositions pp = first; pp != null; pp = pp.next)
|
for (PhrasePositions pp = first; pp != null; pp = pp.next)
|
||||||
pp.firstPosition();
|
pp.firstPosition();
|
||||||
|
|
||||||
// one time initializatin for this scorer
|
// one time initialization for this scorer
|
||||||
if (!checkedRepeats) {
|
if (!checkedRepeats) {
|
||||||
checkedRepeats = true;
|
checkedRepeats = true;
|
||||||
// check for repeats
|
// check for repeats
|
||||||
HashSet<PhrasePositions> m = null;
|
LinkedHashSet<PhrasePositions> m = null; // see comment (*) below why order is important
|
||||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||||
int tpPos = pp.position + pp.offset;
|
int tpPos = pp.position + pp.offset;
|
||||||
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
||||||
|
@ -161,7 +161,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
int tpPos2 = pp2.position + pp2.offset;
|
int tpPos2 = pp2.position + pp2.offset;
|
||||||
if (tpPos2 == tpPos) {
|
if (tpPos2 == tpPos) {
|
||||||
if (m == null)
|
if (m == null)
|
||||||
m = new HashSet<PhrasePositions>();
|
m = new LinkedHashSet<PhrasePositions>();
|
||||||
pp.repeats = true;
|
pp.repeats = true;
|
||||||
pp2.repeats = true;
|
pp2.repeats = true;
|
||||||
m.add(pp);
|
m.add(pp);
|
||||||
|
@ -174,12 +174,15 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// with repeats must advance some repeating pp's so they all start with differing tp's
|
// with repeats must advance some repeating pp's so they all start with differing tp's
|
||||||
|
// (*) It is important that pps are handled in their original order in the query,
|
||||||
|
// because we advance the pp with larger offset, and so processing them in that order
|
||||||
|
// ensures that all pairs are covered.
|
||||||
if (repeats!=null) {
|
if (repeats!=null) {
|
||||||
for (int i = 0; i < repeats.length; i++) {
|
for (int i = 0; i < repeats.length; i++) {
|
||||||
PhrasePositions pp = repeats[i];
|
PhrasePositions pp = repeats[i];
|
||||||
PhrasePositions pp2;
|
PhrasePositions pp2;
|
||||||
while ((pp2 = termPositionsDiffer(pp)) != null) {
|
while ((pp2 = termPositionsConflict(pp)) != null) {
|
||||||
if (!pp2.nextPosition()) // out of pps that do not differ, advance the pp with higher offset
|
if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset
|
||||||
return -1; // ran out of a term -- done
|
return -1; // ran out of a term -- done
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -196,6 +199,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
if (repeats!=null) {
|
if (repeats!=null) {
|
||||||
tmpPos = new PhrasePositions[pq.size()];
|
tmpPos = new PhrasePositions[pq.size()];
|
||||||
}
|
}
|
||||||
|
|
||||||
return end;
|
return end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -205,9 +209,9 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
* @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
|
* @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
|
||||||
* out of the first two PPs found to not differ.
|
* out of the first two PPs found to not differ.
|
||||||
*/
|
*/
|
||||||
private PhrasePositions termPositionsDiffer(PhrasePositions pp) {
|
private PhrasePositions termPositionsConflict(PhrasePositions pp) {
|
||||||
// efficiency note: a more efficient implementation could keep a map between repeating
|
// efficiency note: a more efficient implementation could keep a map between repeating
|
||||||
// pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
|
// pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats
|
||||||
// of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
|
// of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
|
||||||
// However this would complicate code, for a rather rare case, so choice is to compromise here.
|
// However this would complicate code, for a rather rare case, so choice is to compromise here.
|
||||||
int tpPos = pp.position + pp.offset;
|
int tpPos = pp.position + pp.offset;
|
||||||
|
|
|
@ -41,10 +41,13 @@ public class TestSloppyPhraseQuery extends LuceneTestCase {
|
||||||
private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z");
|
private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z");
|
||||||
private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y");
|
private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y");
|
||||||
private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A");
|
private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A");
|
||||||
|
private static final Document DOC_5_3 = makeDocument("H H H X X X H H H X X X H H H");
|
||||||
|
private static final Document DOC_5_4 = makeDocument("H H H H");
|
||||||
|
|
||||||
private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 );
|
private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 );
|
||||||
private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 );
|
private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 );
|
||||||
private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A");
|
private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A");
|
||||||
|
private static final PhraseQuery QUERY_5_4 = makePhraseQuery( "H H H H");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test DOC_4 and QUERY_4.
|
* Test DOC_4 and QUERY_4.
|
||||||
|
@ -112,6 +115,21 @@ public class TestSloppyPhraseQuery extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** LUCENE-3412 */
|
||||||
|
public void testDoc5_Query5_Any_Slop_Should_be_consistent() throws Exception {
|
||||||
|
int nRepeats = 5;
|
||||||
|
for (int slop=0; slop<3; slop++) {
|
||||||
|
for (int trial=0; trial<nRepeats; trial++) {
|
||||||
|
// should steadily always find this one
|
||||||
|
checkPhraseQuery(DOC_5_4, QUERY_5_4, slop, 1);
|
||||||
|
}
|
||||||
|
for (int trial=0; trial<nRepeats; trial++) {
|
||||||
|
// should steadily never find this one
|
||||||
|
checkPhraseQuery(DOC_5_3, QUERY_5_4, slop, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private float checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception {
|
private float checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception {
|
||||||
query.setSlop(slop);
|
query.setSlop(slop);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue