From c54608b60dc6fb3e451dd1d3c430cf42c38f5fb5 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Thu, 22 Sep 2011 06:27:43 +0000 Subject: [PATCH] LUCENE-3215: SloppyPhraseScorer sometimes computed Infinite freq. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1173961 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 +- .../apache/lucene/search/PhrasePositions.java | 16 +- .../apache/lucene/search/PhraseScorer.java | 145 ++---- .../lucene/search/SloppyPhraseScorer.java | 443 ++++++++++-------- .../lucene/search/TestSloppyPhraseQuery.java | 144 ++++++ 5 files changed, 452 insertions(+), 301 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9bb5f17d795..4ad68b0ce4a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -606,7 +606,7 @@ Bug fixes * LUCENE-3432: IndexWriter.expungeDeletes with TieredMergePolicy should ignore the maxMergedSegmentMB setting (v.sevel via Mike McCandless) - + * LUCENE-3442: TermQuery.TermWeight.scorer() returns null for non-atomic IndexReaders (optimization bug, introcuced by LUCENE-2829), preventing QueryWrapperFilter and similar classes to get a top-level DocIdSet. @@ -621,6 +621,9 @@ Bug fixes incorrectly invoking the DeletionPolicy and (then possibly deleting files) on the closed IndexWriter (Robert Muir, Mike McCandless) +* LUCENE-3215: SloppyPhraseScorer sometimes computed Infinite freq + (Robert Muir, Doron Cohen) + New Features Optimizations diff --git a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java index 00c638965cc..004d857c0f6 100644 --- a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java @@ -31,7 +31,7 @@ final class PhrasePositions { final int ord; // unique across all PhrasePositions instances final DocsAndPositionsEnum postings; // stream of docs & positions PhrasePositions next; // used to make lists - boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) + PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) { this.postings = postings; @@ -41,7 +41,7 @@ final class PhrasePositions { final boolean next() throws IOException { // increments to next doc doc = postings.nextDoc(); - if (doc == postings.NO_MORE_DOCS) { + if (doc == DocIdSetIterator.NO_MORE_DOCS) { return false; } return true; @@ -49,7 +49,7 @@ final class PhrasePositions { final boolean skipTo(int target) throws IOException { doc = postings.advance(target); - if (doc == postings.NO_MORE_DOCS) { + if (doc == DocIdSetIterator.NO_MORE_DOCS) { return false; } return true; @@ -73,4 +73,14 @@ final class PhrasePositions { } else return false; } + + /** for debug purposes */ + @Override + public String toString() { + String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count; + if (nextRepeating!=null) { + s += " rpt[ "+nextRepeating+" ]"; + } + return s; + } } diff --git a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java index 3afcb67516e..3f2f6d8db96 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java @@ -32,17 +32,14 @@ import org.apache.lucene.search.similarities.Similarity; * means a match. 
*/ abstract class PhraseScorer extends Scorer { - private boolean firstTime = true; - private boolean more = true; - protected PhraseQueue pq; - protected PhrasePositions first, last; + PhrasePositions min, max; private float freq; //phrase frequency in current doc as computed by phraseFreq(). - protected final Similarity.SloppyDocScorer docScorer; + final Similarity.SloppyDocScorer docScorer; PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.SloppyDocScorer docScorer) throws IOException { + Similarity.SloppyDocScorer docScorer) { super(weight); this.docScorer = docScorer; @@ -51,75 +48,66 @@ abstract class PhraseScorer extends Scorer { // reflects the phrase offset: pp.pos = tp.pos - offset. // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. - for (int i = 0; i < postings.length; i++) { - PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i); - if (last != null) { // add next to end of list - last.next = pp; - } else { - first = pp; + if (postings.length > 0) { + min = new PhrasePositions(postings[0].postings, postings[0].position, 0); + max = min; + max.doc = -1; + for (int i = 1; i < postings.length; i++) { + PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i); + max.next = pp; + max = pp; + max.doc = -1; } - last = pp; + max.next = min; // make it cyclic for easier manipulation } - - pq = new PhraseQueue(postings.length); // construct empty pq - first.doc = -1; } @Override - public int docID() { return first.doc; } + public int docID() { + return max.doc; + } @Override public int nextDoc() throws IOException { - if (firstTime) { - init(); - firstTime = false; - } else if (more) { - more = last.next(); // trigger further scanning - } - if (!doNext()) { - first.doc = NO_MORE_DOCS; - } - return first.doc; + return advance(max.doc); } - // next without initial increment - private boolean doNext() throws IOException { - while (more) { - while (more && first.doc < last.doc) { // find doc w/ all the terms - more = first.skipTo(last.doc); // skip first upto last - firstToLast(); // and move it to the end - } - - if (more) { - // found a doc with all of the terms - freq = phraseFreq(); // check for phrase - if (freq == 0.0f) // no match - more = last.next(); // trigger further scanning - else - return true; // found a match - } - } - return false; // no more matches - } - @Override public float score() throws IOException { - return docScorer.score(first.doc, freq); + return docScorer.score(max.doc, freq); } + private boolean advanceMin(int target) throws IOException { + if (!min.skipTo(target)) { + max.doc = NO_MORE_DOCS; // for further calls to docID() + return false; + } + min = min.next; // cyclic + max = max.next; // cyclic + return true; + } + @Override public int advance(int target) throws IOException { - firstTime = false; - for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { - more = pp.skipTo(target); - } - if (more) { - sort(); // re-sort - } - if (!doNext()) { - first.doc = NO_MORE_DOCS; - } - return first.doc; + freq = 0.0f; + if (!advanceMin(target)) { + return NO_MORE_DOCS; + } + boolean restart=false; + while (freq == 0.0f) { + while (min.doc < max.doc || restart) { + restart = false; + if (!advanceMin(max.doc)) { + return NO_MORE_DOCS; + } + } + // found a doc with all of the terms + freq = phraseFreq(); // check for phrase + restart = true; + } + + // found a match + return max.doc; } /** @@ 
-137,44 +125,7 @@ abstract class PhraseScorer extends Scorer { *
Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations. * @return frequency of the phrase in current doc, 0 if not found. */ - protected abstract float phraseFreq() throws IOException; - - private void init() throws IOException { - for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { - more = pp.next(); - } - if (more) { - sort(); - } - } - - private void sort() { - pq.clear(); - for (PhrasePositions pp = first; pp != null; pp = pp.next) { - pq.add(pp); - } - pqToList(); - } - - protected final void pqToList() { - last = first = null; - while (pq.top() != null) { - PhrasePositions pp = pq.pop(); - if (last != null) { // add next to end of list - last.next = pp; - } else - first = pp; - last = pp; - pp.next = null; - } - } - - protected final void firstToLast() { - last.next = first; // move first to end of list - last = first; - first = first.next; - last.next = null; - } + abstract float phraseFreq() throws IOException; @Override public String toString() { return "scorer(" + weight + ")"; } diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 8609637e34d..c123ef7c6cc 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -18,218 +18,261 @@ package org.apache.lucene.search; */ import java.io.IOException; -import java.util.LinkedHashSet; +import java.util.ArrayList; import org.apache.lucene.search.similarities.Similarity; final class SloppyPhraseScorer extends PhraseScorer { - private int slop; - private PhrasePositions repeats[]; - private PhrasePositions tmpPos[]; // for flipping repeating pps. - private boolean checkedRepeats; - - SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - int slop, Similarity.SloppyDocScorer docScorer) throws IOException { - super(weight, postings, docScorer); - this.slop = slop; + private int slop; + private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats + private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc) + private PhraseQueue pq; // for advancing min position + private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset + + SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, + int slop, Similarity.SloppyDocScorer docScorer) { + super(weight, postings, docScorer); + this.slop = slop; + } + + /** + * Score a candidate doc for all slop-valid position-combinations (matches) + * encountered while traversing/hopping the PhrasePositions. + *
The score contribution of a match depends on the distance: + *
- highest score for distance=0 (exact match). + *
- score gets lower as distance gets higher. + *
Example: for query "a b"~2, a document "x a b a y" can be scored twice: + * once for "a b" (distance=0), and once for "b a" (distance=2). + *
Possibly not all valid combinations are encountered, because for efficiency + * we always propagate the least PhrasePosition. This allows us to rely on a + * PriorityQueue and move forward faster. + * As a result, for example, document "a b c b a" + * would score differently for queries "a b c"~4 and "c b a"~4, although + * they really are equivalent. + * Similarly, for doc "a b c b a f g", query "c b"~2 + * would get the same score as "g f"~2, although "c b"~2 could be matched twice. + * We may want to fix this in the future (currently not, for performance reasons). + */ + @Override + protected float phraseFreq() throws IOException { + int end = initPhrasePositions(); + //printPositions(System.err, "INIT DONE:"); + if (end==Integer.MIN_VALUE) { + return 0.0f; } - - /** - * Score a candidate doc for all slop-valid position-combinations (matches) - * encountered while traversing/hopping the PhrasePositions. - *
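
To make the distance-based contribution above concrete: each match whose edit distance is within the slop adds a slop factor that shrinks as the distance grows. The standalone sketch below assumes the 1/(1+distance) factor used by the classic TFIDF similarity; the class and method names (SloppyFreqSketch, sloppyFreqOf) are made up for illustration, and in the scorer the per-match value actually comes from Similarity.SloppyDocScorer.computeSlopFactor(matchLength).

public class SloppyFreqSketch {

  // Assumed slop factor, matching the classic TFIDF sloppyFreq();
  // the real scorer delegates this to the configured Similarity.
  static float slopFactor(int distance) {
    return 1.0f / (1.0f + distance);
  }

  // Sums the factors of all matches whose edit distance is within the slop.
  static float sloppyFreqOf(int slop, int... matchDistances) {
    float freq = 0.0f;
    for (int d : matchDistances) {
      if (d <= slop) {
        freq += slopFactor(d);
      }
    }
    return freq;
  }

  public static void main(String[] args) {
    // "a b"~2 against "x a b a y": one exact match (distance=0)
    // and one reversed match "b a" (distance=2).
    System.out.println(sloppyFreqOf(2, 0, 2)); // 1.0 + 1/3 ~= 1.333
  }
}
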
The score contribution of a match depends on the distance: - *
- highest score for distance=0 (exact match). - *
- score gets lower as distance gets higher. - *
Example: for query "a b"~2, a document "x a b a y" can be scored twice: - * once for "a b" (distance=0), and once for "b a" (distance=2). - *
Possibly not all valid combinations are encountered, because for efficiency - * we always propagate the least PhrasePosition. This allows to base on - * PriorityQueue and move forward faster. - * As result, for example, document "a b c b a" - * would score differently for queries "a b c"~4 and "c b a"~4, although - * they really are equivalent. - * Similarly, for doc "a b c b a f g", query "c b"~2 - * would get same score as "g f"~2, although "c b"~2 could be matched twice. - * We may want to fix this in the future (currently not, for performance reasons). - */ - @Override - protected float phraseFreq() throws IOException { - int end = initPhrasePositions(); - - float freq = 0.0f; - boolean done = (end<0); - while (!done) { - PhrasePositions pp = pq.pop(); - int start = pp.position; - int next = pq.top().position; - - boolean tpsDiffer = true; - for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) { - if (pos<=next && tpsDiffer) - start = pos; // advance pp to min window - if (!pp.nextPosition()) { - done = true; // ran out of a term -- done - break; - } - PhrasePositions pp2 = null; - tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null; - if (pp2!=null && pp2!=pp) { - pp = flip(pp,pp2); // flip pp to pp2 - } - } - - int matchLength = end - start; - if (matchLength <= slop) - freq += docScorer.computeSlopFactor(matchLength); // score match - - if (pp.position > end) - end = pp.position; - pq.add(pp); // restore pq + + float freq = 0.0f; + PhrasePositions pp = pq.pop(); + int matchLength = end - pp.position; + int next = pq.size()>0 ? pq.top().position : pp.position; + //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength); + while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) { + if (pp.position > next) { + //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength); + if (matchLength <= slop) { + freq += docScorer.computeSlopFactor(matchLength); // score match + } + pq.add(pp); + pp = pq.pop(); + next = pq.size()>0 ? pq.top().position : pp.position; + matchLength = end - pp.position; + //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength); + } else { + int matchLength2 = end - pp.position; + //printQueue(System.err, pp, "C: mlen2 repeatsEnd) { + repeatsEnd = pp.position; + } + if (!hasRepeats) { + return repeatsEnd; + } + int tpPos = tpPos(pp); + for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) { + while (tpPos(pp2) <= tpPos) { + if (!pp2.nextPosition()) { + return Integer.MIN_VALUE; + } + } + tpPos = tpPos(pp2); + if (pp2.position > repeatsEnd) { + repeatsEnd = pp2.position; + } + // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position. + // so in order to have the right "start" in matchLength computation we fake pp.position. + // this relies on pp.nextPosition() not using pp.position. + if (pp2.position < pp.position) { + pp.position = pp2.position; + } + } + return repeatsEnd; + } + + /** + * Initialize PhrasePositions in place. + * There is a one time initialization for this scorer (taking place at the first doc that matches all terms): + * + * Examples: + *
    + *
  1. no repetitions: "ho my"~2 + *
  2. repetitions: "ho my my"~2 + *
  3. repetitions: "my ho my"~2 + *
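
For reference, a repeating-terms query like example 3 is simply a PhraseQuery with the same term added at two positions; a minimal sketch, where the field name "f" and the helper name are arbitrary:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

class RepeatingPhraseQueryExample {
  // Example 3 above, "my ho my"~2: the same term at two query offsets,
  // which is exactly the situation the repeating-pp handling targets.
  static PhraseQuery myHoMy() {
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("f", "my"), 0);
    pq.add(new Term("f", "ho"), 1);
    pq.add(new Term("f", "my"), 2);
    pq.setSlop(2);
    return pq;
  }
}

Example 2 ("ho my my"~2) is built the same way, adding "ho", "my", "my" at positions 0, 1, 2.
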
+ * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done) + */ + private int initPhrasePositions() throws IOException { + int end = Integer.MIN_VALUE; + + // no repeats at all (most common case is also the simplest one) + if (checkedRepeats && !hasRepeats) { + // build queue from list + pq.clear(); + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + pp.firstPosition(); + if (pp.position > end) { + end = pp.position; + } + pq.add(pp); // build pq from list + } + return end; } - // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back. - // assumes: pp!=pp2, pp2 in pq, pp not in pq. - // called only when there are repeating pps. - private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) { - int n=0; - PhrasePositions pp3; - //pop until finding pp2 - while ((pp3=pq.pop()) != pp2) { - tmpPos[n++] = pp3; + //printPositions(System.err, "Init: 1: Bef position"); + + // position the pp's + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + pp.firstPosition(); + } + + //printPositions(System.err, "Init: 2: Aft position"); + + // one time initialization for this scorer (done only for the first candidate doc) + if (!checkedRepeats) { + checkedRepeats = true; + ArrayList ppsA = new ArrayList(); + PhrasePositions dummyPP = new PhrasePositions(null, -1, -1); + // check for repeats + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + if (pp.nextRepeating != null) { + continue; // a repetition of an earlier pp + } + ppsA.add(pp); + int tpPos = tpPos(pp); + for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) { + if ( + pp2.nextRepeating != null // already detected as a repetition of an earlier pp + || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query! 
+ || tpPos(pp2) != tpPos) { // not a repetition + continue; + } + // a repetition + hasRepeats = true; + prevB.nextRepeating = pp2; // add pp2 to the repeats linked list + pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list + prevB = pp2; + } } - //insert back all but pp2 - for (n--; n>=0; n--) { - pq.insertWithOverflow(tmpPos[n]); + if (hasRepeats) { + // clean dummy markers + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + if (pp.nextRepeating == dummyPP) { + pp.nextRepeating = null; + } + } + } + nrPps = ppsA.toArray(new PhrasePositions[0]); + pq = new PhraseQueue(nrPps.length); + } + + //printPositions(System.err, "Init: 3: Aft check-repeats"); + + // with repeats must advance some repeating pp's so they all start with differing tp's + if (hasRepeats) { + for (PhrasePositions pp: nrPps) { + if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) { + return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc) + } + } + } + + //printPositions(System.err, "Init: 4: Aft advance-repeats"); + + // build queue from non repeating pps + pq.clear(); + for (PhrasePositions pp: nrPps) { + if (pp.position > end) { + end = pp.position; } - //insert pp back pq.add(pp); - return pp2; } + + return end; + } + + /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */ + private final int tpPos(PhrasePositions pp) { + return pp.position + pp.offset; + } + +// private void printPositions(PrintStream ps, String title) { +// ps.println(); +// ps.println("---- "+title); +// int k = 0; +// if (nrPps!=null) { +// for (PhrasePositions pp: nrPps) { +// ps.println(" " + k++ + " " + pp); +// } +// } else { +// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) { +// ps.println(" " + k++ + " " + pp); +// } +// } +// } - /** - * Init PhrasePositions in place. - * There is a one time initialization for this scorer (taking place at the first doc that matches all terms): - *
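
The tpPos() helper above is what makes the repeat check work: two PhrasePositions standing for the same term occurrence report the same actual document position (phrase position + query offset) even though their query offsets differ. The standalone sketch below illustrates only that grouping idea, with made-up names; the patch does the equivalent in place via the nextRepeating links, and it additionally skips pairs that share the same query offset (distinct terms at the same query position are not repeats), a detail omitted here for brevity.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class RepeatDetectionSketch {

  static class Pp {
    final int position; // phrase position = actual term position - query offset
    final int offset;   // offset of this term in the query
    Pp(int position, int offset) { this.position = position; this.offset = offset; }
    int tpPos() { return position + offset; } // actual position in the doc
  }

  // Groups pps by their actual term position; any group with more than one
  // entry contains repeats that must later be advanced to distinct positions.
  static Map<Integer, List<Pp>> groupByTermPosition(List<Pp> pps) {
    Map<Integer, List<Pp>> groups = new HashMap<Integer, List<Pp>>();
    for (Pp pp : pps) {
      List<Pp> group = groups.get(pp.tpPos());
      if (group == null) {
        group = new ArrayList<Pp>();
        groups.put(pp.tpPos(), group);
      }
      group.add(pp);
    }
    return groups;
  }

  public static void main(String[] args) {
    // query "my ho my"~2 against a doc starting "my ho my ...":
    List<Pp> pps = new ArrayList<Pp>();
    pps.add(new Pp(0, 0));  // "my" at query offset 0, doc position 0
    pps.add(new Pp(0, 1));  // "ho" at query offset 1, doc position 1
    pps.add(new Pp(-2, 2)); // "my" at query offset 2, also doc position 0
    Map<Integer, List<Pp>> groups = groupByTermPosition(pps);
    System.out.println("repeats detected: " + (groups.get(0).size() > 1)); // true
  }
}
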
- Put in repeats[] each pp that has another pp with same position in the doc. - * This relies on that the position in PP is computed as (TP.position - offset) and - * so by adding offset we actually compare positions and identify that the two are - * the same term. - * An exclusion to this is two distinct terms in the same offset in query and same - * position in doc. This case is detected by comparing just the (query) offsets, - * and two such PPs are not considered "repeating". - *
- Also mark each such pp by pp.repeats = true. - *
Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient. - * In particular, this allows to score queries with no repetitions with no overhead due to this computation. - *
- Example 1 - query with no repetitions: "ho my"~2 - *
- Example 2 - query with repetitions: "ho my my"~2 - *
- Example 3 - query with repetitions: "my ho my"~2 - *
Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection. - * @return end (max position), or -1 if any term ran out (i.e. done) - * @throws IOException - */ - private int initPhrasePositions() throws IOException { - int end = 0; - - // no repeats at all (most common case is also the simplest one) - if (checkedRepeats && repeats==null) { - // build queue from list - pq.clear(); - for (PhrasePositions pp = first; pp != null; pp = pp.next) { - pp.firstPosition(); - if (pp.position > end) - end = pp.position; - pq.add(pp); // build pq from list - } - return end; - } - - // position the pp's - for (PhrasePositions pp = first; pp != null; pp = pp.next) - pp.firstPosition(); - - // one time initialization for this scorer - if (!checkedRepeats) { - checkedRepeats = true; - // check for repeats - LinkedHashSet m = null; // see comment (*) below why order is important - for (PhrasePositions pp = first; pp != null; pp = pp.next) { - int tpPos = pp.position + pp.offset; - for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { - if (pp.offset == pp2.offset) { - continue; // not a repetition: the two PPs are originally in same offset in the query! - } - int tpPos2 = pp2.position + pp2.offset; - if (tpPos2 == tpPos) { - if (m == null) - m = new LinkedHashSet(); - pp.repeats = true; - pp2.repeats = true; - m.add(pp); - m.add(pp2); - } - } - } - if (m!=null) - repeats = m.toArray(new PhrasePositions[0]); - } - - // with repeats must advance some repeating pp's so they all start with differing tp's - // (*) It is important that pps are handled by their original order in the query, - // because we advance the pp with larger offset, and so processing them in that order - // allows to cover all pairs. - if (repeats!=null) { - for (int i = 0; i < repeats.length; i++) { - PhrasePositions pp = repeats[i]; - PhrasePositions pp2; - while ((pp2 = termPositionsConflict(pp)) != null) { - if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset - return -1; // ran out of a term -- done - } - } - } - - // build queue from list - pq.clear(); - for (PhrasePositions pp = first; pp != null; pp = pp.next) { - if (pp.position > end) - end = pp.position; - pq.add(pp); // build pq from list - } - - if (repeats!=null) { - tmpPos = new PhrasePositions[pq.size()]; - } - - return end; - } - - /** - * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences - * in the query of the same word would go elsewhere in the matched doc. - * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions - * out of the first two PPs found to not differ. - */ - private PhrasePositions termPositionsConflict(PhrasePositions pp) { - // efficiency note: a more efficient implementation could keep a map between repeating - // pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats - // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. - // However this would complicate code, for a rather rare case, so choice is to compromise here. - int tpPos = pp.position + pp.offset; - for (int i = 0; i < repeats.length; i++) { - PhrasePositions pp2 = repeats[i]; - if (pp2 == pp) { - continue; - } - if (pp.offset == pp2.offset) { - continue; // not a repetition: the two PPs are originally in same offset in the query! - } - int tpPos2 = pp2.position + pp2.offset; - if (tpPos2 == tpPos) { - return pp.offset > pp2.offset ? 
pp : pp2; // do not differ: return the one with higher offset. - } - } - return null; - } +// private void printQueue(PrintStream ps, PhrasePositions ext, String title) { +// ps.println(); +// ps.println("---- "+title); +// ps.println("EXT: "+ext); +// PhrasePositions[] t = new PhrasePositions[pq.size()]; +// if (pq.size()>0) { +// t[0] = pq.pop(); +// ps.println(" " + 0 + " " + t[0]); +// for (int i=1; i=0; i--) { +// pq.add(t[i]); +// } +// } +// } } diff --git a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java index eb0a217bf85..5cab9c7a204 100755 --- a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -201,4 +201,148 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { return false; } } + + /** checks that no scores or freqs are infinite */ + private void assertSaneScoring(PhraseQuery pq, IndexSearcher searcher) throws Exception { + searcher.search(pq, new Collector() { + Scorer scorer; + + @Override + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + } + + @Override + public void collect(int doc) throws IOException { + assertFalse(Float.isInfinite(scorer.freq())); + assertFalse(Float.isInfinite(scorer.score())); + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + // do nothing + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + }); + QueryUtils.check(random, pq, searcher); + } + + // LUCENE-3215 + public void testSlopWithHoles() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + FieldType customType = new FieldType(TextField.TYPE_UNSTORED); + customType.setOmitNorms(true); + Field f = new Field("lyrics", customType, ""); + Document doc = new Document(); + doc.add(f); + f.setValue("drug drug"); + iw.addDocument(doc); + f.setValue("drug druggy drug"); + iw.addDocument(doc); + f.setValue("drug druggy druggy drug"); + iw.addDocument(doc); + f.setValue("drug druggy drug druggy drug"); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + PhraseQuery pq = new PhraseQuery(); + // "drug the drug"~1 + pq.add(new Term("lyrics", "drug"), 1); + pq.add(new Term("lyrics", "drug"), 4); + pq.setSlop(0); + assertEquals(0, is.search(pq, 4).totalHits); + pq.setSlop(1); + assertEquals(3, is.search(pq, 4).totalHits); + pq.setSlop(2); + assertEquals(4, is.search(pq, 4).totalHits); + is.close(); + ir.close(); + dir.close(); + } + + // LUCENE-3215 + public void testInfiniteFreq1() throws Exception { + String document = "drug druggy drug drug drug"; + + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED))); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher is = newSearcher(ir); + PhraseQuery pq = new PhraseQuery(); + // "drug the drug"~1 + pq.add(new Term("lyrics", "drug"), 1); + pq.add(new Term("lyrics", "drug"), 3); + pq.setSlop(1); + assertSaneScoring(pq, is); + is.close(); + ir.close(); + dir.close(); + } + + // LUCENE-3215 + public void testInfiniteFreq2() throws Exception { + String document = + "So much fun to be had in my head " + + "No more sunshine " + + "So much fun just lying in my bed " 
+ + "No more sunshine " + + "I can't face the sunlight and the dirt outside " + + "Wanna stay in 666 where this darkness don't lie " + + "Drug drug druggy " + + "Got a feeling sweet like honey " + + "Drug drug druggy " + + "Need sensation like my baby " + + "Show me your scars you're so aware " + + "I'm not barbaric I just care " + + "Drug drug drug " + + "I need a reflection to prove I exist " + + "No more sunshine " + + "I am a victim of designer blitz " + + "No more sunshine " + + "Dance like a robot when you're chained at the knee " + + "The C.I.A say you're all they'll ever need " + + "Drug drug druggy " + + "Got a feeling sweet like honey " + + "Drug drug druggy " + + "Need sensation like my baby " + + "Snort your lines you're so aware " + + "I'm not barbaric I just care " + + "Drug drug druggy " + + "Got a feeling sweet like honey " + + "Drug drug druggy " + + "Need sensation like my baby"; + + Directory dir = newDirectory(); + + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED))); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher is = newSearcher(ir); + + PhraseQuery pq = new PhraseQuery(); + // "drug the drug"~5 + pq.add(new Term("lyrics", "drug"), 1); + pq.add(new Term("lyrics", "drug"), 3); + pq.setSlop(5); + assertSaneScoring(pq, is); + is.close(); + ir.close(); + dir.close(); + } }