From fa93e73225cb10fab3ac8890aab1b610d292e313 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Thu, 8 Sep 2011 06:51:40 +0000 Subject: [PATCH] LUCENE-3412: SloppyPhraseScorer was returning non-deterministic results for queries with many repeats. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166541 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 ++ .../lucene/search/SloppyPhraseScorer.java | 28 +++++++++++-------- .../lucene/search/TestSloppyPhraseQuery.java | 18 ++++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 244ce3e327f..5c588d28aa5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -590,6 +590,9 @@ Bug fixes easily corrupt the index. (Mark Miller, Robert Muir, Mike McCandless) +* LUCENE-3412: SloppyPhraseScorer was returning non-deterministic results + for queries with many repeats (Doron Cohen) + New Features * LUCENE-3290: Added FieldInvertState.numUniqueTerms diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index d13f5cb41e6..882013979c8 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -18,7 +18,7 @@ package org.apache.lucene.search; */ import java.io.IOException; -import java.util.HashSet; +import java.util.LinkedHashSet; final class SloppyPhraseScorer extends PhraseScorer { private int slop; @@ -70,7 +70,7 @@ final class SloppyPhraseScorer extends PhraseScorer { break; } PhrasePositions pp2 = null; - tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null; + tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null; if (pp2!=null && pp2!=pp) { pp = flip(pp,pp2); // flip pp to pp2 } @@ -118,7 +118,7 @@ final class SloppyPhraseScorer extends PhraseScorer { * position in doc. This case is detected by comparing just the (query) offsets, * and two such PPs are not considered "repeating". *
- Also mark each such pp by pp.repeats = true. - *
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. + *
Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient. * In particular, this allows to score queries with no repetitions with no overhead due to this computation. *
- Example 1 - query with no repetitions: "ho my"~2 *
- Example 2 - query with repetitions: "ho my my"~2 @@ -147,11 +147,11 @@ final class SloppyPhraseScorer extends PhraseScorer { for (PhrasePositions pp = first; pp != null; pp = pp.next) pp.firstPosition(); - // one time initializatin for this scorer + // one time initialization for this scorer if (!checkedRepeats) { checkedRepeats = true; // check for repeats - HashSet m = null; + LinkedHashSet m = null; // see comment (*) below why order is important for (PhrasePositions pp = first; pp != null; pp = pp.next) { int tpPos = pp.position + pp.offset; for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { @@ -161,7 +161,7 @@ final class SloppyPhraseScorer extends PhraseScorer { int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) { if (m == null) - m = new HashSet(); + m = new LinkedHashSet(); pp.repeats = true; pp2.repeats = true; m.add(pp); @@ -173,14 +173,17 @@ final class SloppyPhraseScorer extends PhraseScorer { repeats = m.toArray(new PhrasePositions[0]); } - // with repeats must advance some repeating pp's so they all start with differing tp's + // with repeats must advance some repeating pp's so they all start with differing tp's + // (*) It is important that pps are handled by their original order in the query, + // because we advance the pp with larger offset, and so processing them in that order + // allows to cover all pairs. if (repeats!=null) { for (int i = 0; i < repeats.length; i++) { PhrasePositions pp = repeats[i]; PhrasePositions pp2; - while ((pp2 = termPositionsDiffer(pp)) != null) { - if (!pp2.nextPosition()) // out of pps that do not differ, advance the pp with higher offset - return -1; // ran out of a term -- done + while ((pp2 = termPositionsConflict(pp)) != null) { + if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset + return -1; // ran out of a term -- done } } } @@ -196,6 +199,7 @@ final class SloppyPhraseScorer extends PhraseScorer { if (repeats!=null) { tmpPos = new PhrasePositions[pq.size()]; } + return end; } @@ -205,9 +209,9 @@ final class SloppyPhraseScorer extends PhraseScorer { * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions * out of the first two PPs found to not differ. */ - private PhrasePositions termPositionsDiffer(PhrasePositions pp) { + private PhrasePositions termPositionsConflict(PhrasePositions pp) { // efficiency note: a more efficient implementation could keep a map between repeating - // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats + // pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. // However this would complicate code, for a rather rare case, so choice is to compromise here. int tpPos = pp.position + pp.offset; diff --git a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java index e7ae2eafa3a..c7076c2542a 100755 --- a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -41,10 +41,13 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z"); private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y"); private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A"); + private static final Document DOC_5_3 = makeDocument("H H H X X X H H H X X X H H H"); + private static final Document DOC_5_4 = makeDocument("H H H H"); private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 ); private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 ); private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A"); + private static final PhraseQuery QUERY_5_4 = makePhraseQuery( "H H H H"); /** * Test DOC_4 and QUERY_4. @@ -112,6 +115,21 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { } } + /** LUCENE-3412 */ + public void testDoc5_Query5_Any_Slop_Should_be_consistent() throws Exception { + int nRepeats = 5; + for (int slop=0; slop<3; slop++) { + for (int trial=0; trial