mirror of https://github.com/apache/lucene.git
LUCENE-3215: SloppyPhraseScorer sometimes computed Infinite freq.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1173961 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
18008231ed
commit
c54608b60d
|
@ -606,7 +606,7 @@ Bug fixes
|
|||
* LUCENE-3432: IndexWriter.expungeDeletes with TieredMergePolicy
|
||||
should ignore the maxMergedSegmentMB setting (v.sevel via Mike
|
||||
McCandless)
|
||||
|
||||
|
||||
* LUCENE-3442: TermQuery.TermWeight.scorer() returns null for non-atomic
|
||||
IndexReaders (optimization bug, introcuced by LUCENE-2829), preventing
|
||||
QueryWrapperFilter and similar classes to get a top-level DocIdSet.
|
||||
|
@ -621,6 +621,9 @@ Bug fixes
|
|||
incorrectly invoking the DeletionPolicy and (then possibly deleting
|
||||
files) on the closed IndexWriter (Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-3215: SloppyPhraseScorer sometimes computed Infinite freq
|
||||
(Robert Muir, Doron Cohen)
|
||||
|
||||
New Features
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -31,7 +31,7 @@ final class PhrasePositions {
|
|||
final int ord; // unique across all PhrasePositions instances
|
||||
final DocsAndPositionsEnum postings; // stream of docs & positions
|
||||
PhrasePositions next; // used to make lists
|
||||
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
|
||||
PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets
|
||||
|
||||
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
|
||||
this.postings = postings;
|
||||
|
@ -41,7 +41,7 @@ final class PhrasePositions {
|
|||
|
||||
final boolean next() throws IOException { // increments to next doc
|
||||
doc = postings.nextDoc();
|
||||
if (doc == postings.NO_MORE_DOCS) {
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -49,7 +49,7 @@ final class PhrasePositions {
|
|||
|
||||
final boolean skipTo(int target) throws IOException {
|
||||
doc = postings.advance(target);
|
||||
if (doc == postings.NO_MORE_DOCS) {
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -73,4 +73,14 @@ final class PhrasePositions {
|
|||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
/** for debug purposes */
|
||||
@Override
|
||||
public String toString() {
|
||||
String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
|
||||
if (nextRepeating!=null) {
|
||||
s += " rpt[ "+nextRepeating+" ]";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,17 +32,14 @@ import org.apache.lucene.search.similarities.Similarity;
|
|||
* means a match.
|
||||
*/
|
||||
abstract class PhraseScorer extends Scorer {
|
||||
private boolean firstTime = true;
|
||||
private boolean more = true;
|
||||
protected PhraseQueue pq;
|
||||
protected PhrasePositions first, last;
|
||||
PhrasePositions min, max;
|
||||
|
||||
private float freq; //phrase frequency in current doc as computed by phraseFreq().
|
||||
|
||||
protected final Similarity.SloppyDocScorer docScorer;
|
||||
final Similarity.SloppyDocScorer docScorer;
|
||||
|
||||
PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
|
||||
Similarity.SloppyDocScorer docScorer) throws IOException {
|
||||
Similarity.SloppyDocScorer docScorer) {
|
||||
super(weight);
|
||||
this.docScorer = docScorer;
|
||||
|
||||
|
@ -51,75 +48,66 @@ abstract class PhraseScorer extends Scorer {
|
|||
// reflects the phrase offset: pp.pos = tp.pos - offset.
|
||||
// this allows to easily identify a matching (exact) phrase
|
||||
// when all PhrasePositions have exactly the same position.
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else {
|
||||
first = pp;
|
||||
if (postings.length > 0) {
|
||||
min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
|
||||
max = min;
|
||||
max.doc = -1;
|
||||
for (int i = 1; i < postings.length; i++) {
|
||||
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
|
||||
max.next = pp;
|
||||
max = pp;
|
||||
max.doc = -1;
|
||||
}
|
||||
last = pp;
|
||||
max.next = min; // make it cyclic for easier manipulation
|
||||
}
|
||||
|
||||
pq = new PhraseQueue(postings.length); // construct empty pq
|
||||
first.doc = -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() { return first.doc; }
|
||||
public int docID() {
|
||||
return max.doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (firstTime) {
|
||||
init();
|
||||
firstTime = false;
|
||||
} else if (more) {
|
||||
more = last.next(); // trigger further scanning
|
||||
}
|
||||
if (!doNext()) {
|
||||
first.doc = NO_MORE_DOCS;
|
||||
}
|
||||
return first.doc;
|
||||
return advance(max.doc);
|
||||
}
|
||||
|
||||
// next without initial increment
|
||||
private boolean doNext() throws IOException {
|
||||
while (more) {
|
||||
while (more && first.doc < last.doc) { // find doc w/ all the terms
|
||||
more = first.skipTo(last.doc); // skip first upto last
|
||||
firstToLast(); // and move it to the end
|
||||
}
|
||||
|
||||
if (more) {
|
||||
// found a doc with all of the terms
|
||||
freq = phraseFreq(); // check for phrase
|
||||
if (freq == 0.0f) // no match
|
||||
more = last.next(); // trigger further scanning
|
||||
else
|
||||
return true; // found a match
|
||||
}
|
||||
}
|
||||
return false; // no more matches
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return docScorer.score(first.doc, freq);
|
||||
return docScorer.score(max.doc, freq);
|
||||
}
|
||||
|
||||
private boolean advanceMin(int target) throws IOException {
|
||||
if (!min.skipTo(target)) {
|
||||
max.doc = NO_MORE_DOCS; // for further calls to docID()
|
||||
return false;
|
||||
}
|
||||
min = min.next; // cyclic
|
||||
max = max.next; // cyclic
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
firstTime = false;
|
||||
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
|
||||
more = pp.skipTo(target);
|
||||
}
|
||||
if (more) {
|
||||
sort(); // re-sort
|
||||
}
|
||||
if (!doNext()) {
|
||||
first.doc = NO_MORE_DOCS;
|
||||
}
|
||||
return first.doc;
|
||||
freq = 0.0f;
|
||||
if (!advanceMin(target)) {
|
||||
return NO_MORE_DOCS;
|
||||
}
|
||||
boolean restart=false;
|
||||
while (freq == 0.0f) {
|
||||
while (min.doc < max.doc || restart) {
|
||||
restart = false;
|
||||
if (!advanceMin(max.doc)) {
|
||||
return NO_MORE_DOCS;
|
||||
}
|
||||
}
|
||||
// found a doc with all of the terms
|
||||
freq = phraseFreq(); // check for phrase
|
||||
restart = true;
|
||||
}
|
||||
|
||||
// found a match
|
||||
return max.doc;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -137,44 +125,7 @@ abstract class PhraseScorer extends Scorer {
|
|||
* <br>Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
|
||||
* @return frequency of the phrase in current doc, 0 if not found.
|
||||
*/
|
||||
protected abstract float phraseFreq() throws IOException;
|
||||
|
||||
private void init() throws IOException {
|
||||
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
|
||||
more = pp.next();
|
||||
}
|
||||
if (more) {
|
||||
sort();
|
||||
}
|
||||
}
|
||||
|
||||
private void sort() {
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pq.add(pp);
|
||||
}
|
||||
pqToList();
|
||||
}
|
||||
|
||||
protected final void pqToList() {
|
||||
last = first = null;
|
||||
while (pq.top() != null) {
|
||||
PhrasePositions pp = pq.pop();
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
first = pp;
|
||||
last = pp;
|
||||
pp.next = null;
|
||||
}
|
||||
}
|
||||
|
||||
protected final void firstToLast() {
|
||||
last.next = first; // move first to end of list
|
||||
last = first;
|
||||
first = first.next;
|
||||
last.next = null;
|
||||
}
|
||||
abstract float phraseFreq() throws IOException;
|
||||
|
||||
@Override
|
||||
public String toString() { return "scorer(" + weight + ")"; }
|
||||
|
|
|
@ -18,218 +18,261 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
final class SloppyPhraseScorer extends PhraseScorer {
|
||||
private int slop;
|
||||
private PhrasePositions repeats[];
|
||||
private PhrasePositions tmpPos[]; // for flipping repeating pps.
|
||||
private boolean checkedRepeats;
|
||||
|
||||
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
|
||||
int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
|
||||
super(weight, postings, docScorer);
|
||||
this.slop = slop;
|
||||
private int slop;
|
||||
private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
|
||||
private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
|
||||
private PhraseQueue pq; // for advancing min position
|
||||
private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset
|
||||
|
||||
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
|
||||
int slop, Similarity.SloppyDocScorer docScorer) {
|
||||
super(weight, postings, docScorer);
|
||||
this.slop = slop;
|
||||
}
|
||||
|
||||
/**
|
||||
* Score a candidate doc for all slop-valid position-combinations (matches)
|
||||
* encountered while traversing/hopping the PhrasePositions.
|
||||
* <br> The score contribution of a match depends on the distance:
|
||||
* <br> - highest score for distance=0 (exact match).
|
||||
* <br> - score gets lower as distance gets higher.
|
||||
* <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
|
||||
* once for "a b" (distance=0), and once for "b a" (distance=2).
|
||||
* <br>Possibly not all valid combinations are encountered, because for efficiency
|
||||
* we always propagate the least PhrasePosition. This allows to base on
|
||||
* PriorityQueue and move forward faster.
|
||||
* As result, for example, document "a b c b a"
|
||||
* would score differently for queries "a b c"~4 and "c b a"~4, although
|
||||
* they really are equivalent.
|
||||
* Similarly, for doc "a b c b a f g", query "c b"~2
|
||||
* would get same score as "g f"~2, although "c b"~2 could be matched twice.
|
||||
* We may want to fix this in the future (currently not, for performance reasons).
|
||||
*/
|
||||
@Override
|
||||
protected float phraseFreq() throws IOException {
|
||||
int end = initPhrasePositions();
|
||||
//printPositions(System.err, "INIT DONE:");
|
||||
if (end==Integer.MIN_VALUE) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Score a candidate doc for all slop-valid position-combinations (matches)
|
||||
* encountered while traversing/hopping the PhrasePositions.
|
||||
* <br> The score contribution of a match depends on the distance:
|
||||
* <br> - highest score for distance=0 (exact match).
|
||||
* <br> - score gets lower as distance gets higher.
|
||||
* <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
|
||||
* once for "a b" (distance=0), and once for "b a" (distance=2).
|
||||
* <br>Possibly not all valid combinations are encountered, because for efficiency
|
||||
* we always propagate the least PhrasePosition. This allows to base on
|
||||
* PriorityQueue and move forward faster.
|
||||
* As result, for example, document "a b c b a"
|
||||
* would score differently for queries "a b c"~4 and "c b a"~4, although
|
||||
* they really are equivalent.
|
||||
* Similarly, for doc "a b c b a f g", query "c b"~2
|
||||
* would get same score as "g f"~2, although "c b"~2 could be matched twice.
|
||||
* We may want to fix this in the future (currently not, for performance reasons).
|
||||
*/
|
||||
@Override
|
||||
protected float phraseFreq() throws IOException {
|
||||
int end = initPhrasePositions();
|
||||
|
||||
float freq = 0.0f;
|
||||
boolean done = (end<0);
|
||||
while (!done) {
|
||||
PhrasePositions pp = pq.pop();
|
||||
int start = pp.position;
|
||||
int next = pq.top().position;
|
||||
|
||||
boolean tpsDiffer = true;
|
||||
for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
|
||||
if (pos<=next && tpsDiffer)
|
||||
start = pos; // advance pp to min window
|
||||
if (!pp.nextPosition()) {
|
||||
done = true; // ran out of a term -- done
|
||||
break;
|
||||
}
|
||||
PhrasePositions pp2 = null;
|
||||
tpsDiffer = !pp.repeats || (pp2 = termPositionsConflict(pp))==null;
|
||||
if (pp2!=null && pp2!=pp) {
|
||||
pp = flip(pp,pp2); // flip pp to pp2
|
||||
}
|
||||
}
|
||||
|
||||
int matchLength = end - start;
|
||||
if (matchLength <= slop)
|
||||
freq += docScorer.computeSlopFactor(matchLength); // score match
|
||||
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.add(pp); // restore pq
|
||||
|
||||
float freq = 0.0f;
|
||||
PhrasePositions pp = pq.pop();
|
||||
int matchLength = end - pp.position;
|
||||
int next = pq.size()>0 ? pq.top().position : pp.position;
|
||||
//printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
|
||||
while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) {
|
||||
if (pp.position > next) {
|
||||
//printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
|
||||
if (matchLength <= slop) {
|
||||
freq += docScorer.computeSlopFactor(matchLength); // score match
|
||||
}
|
||||
pq.add(pp);
|
||||
pp = pq.pop();
|
||||
next = pq.size()>0 ? pq.top().position : pp.position;
|
||||
matchLength = end - pp.position;
|
||||
//printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
|
||||
} else {
|
||||
int matchLength2 = end - pp.position;
|
||||
//printQueue(System.err, pp, "C: mlen2<mlen: next="+next+" matchLength="+matchLength+" matchLength2="+matchLength2);
|
||||
if (matchLength2 < matchLength) {
|
||||
matchLength = matchLength2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (matchLength <= slop) {
|
||||
freq += docScorer.computeSlopFactor(matchLength); // score match
|
||||
}
|
||||
return freq;
|
||||
}
|
||||
|
||||
return freq;
|
||||
/**
|
||||
* Advance repeating pps of an input (non-repeating) pp.
|
||||
* Return a modified 'end' in case pp or its repeats exceeds original 'end'.
|
||||
* "Dirty" trick: when there are repeats, modifies pp's position to that of
|
||||
* least repeater of pp (needed when due to holes repeaters' positions are "back").
|
||||
*/
|
||||
private int advanceRepeats(PhrasePositions pp, int end) throws IOException {
|
||||
int repeatsEnd = end;
|
||||
if (pp.position > repeatsEnd) {
|
||||
repeatsEnd = pp.position;
|
||||
}
|
||||
if (!hasRepeats) {
|
||||
return repeatsEnd;
|
||||
}
|
||||
int tpPos = tpPos(pp);
|
||||
for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
|
||||
while (tpPos(pp2) <= tpPos) {
|
||||
if (!pp2.nextPosition()) {
|
||||
return Integer.MIN_VALUE;
|
||||
}
|
||||
}
|
||||
tpPos = tpPos(pp2);
|
||||
if (pp2.position > repeatsEnd) {
|
||||
repeatsEnd = pp2.position;
|
||||
}
|
||||
// "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position.
|
||||
// so in order to have the right "start" in matchLength computation we fake pp.position.
|
||||
// this relies on pp.nextPosition() not using pp.position.
|
||||
if (pp2.position < pp.position) {
|
||||
pp.position = pp2.position;
|
||||
}
|
||||
}
|
||||
return repeatsEnd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize PhrasePositions in place.
|
||||
* There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
|
||||
* <ul>
|
||||
* <li>Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
|
||||
* <li>For each such group:
|
||||
* <ul>
|
||||
* <li>form an inner linked list of the repeating ones.
|
||||
* <li>propagate all group members but first so that they land on different tpPos().
|
||||
* </ul>
|
||||
* <li>Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
|
||||
* <li>Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
|
||||
* </ul>
|
||||
* Examples:
|
||||
* <ol>
|
||||
* <li>no repetitions: <b>"ho my"~2</b>
|
||||
* <li>repetitions: <b>"ho my my"~2</b>
|
||||
* <li>repetitions: <b>"my ho my"~2</b>
|
||||
* </ol>
|
||||
* @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
|
||||
*/
|
||||
private int initPhrasePositions() throws IOException {
|
||||
int end = Integer.MIN_VALUE;
|
||||
|
||||
// no repeats at all (most common case is also the simplest one)
|
||||
if (checkedRepeats && !hasRepeats) {
|
||||
// build queue from list
|
||||
pq.clear();
|
||||
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
|
||||
pp.firstPosition();
|
||||
if (pp.position > end) {
|
||||
end = pp.position;
|
||||
}
|
||||
pq.add(pp); // build pq from list
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
// flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
|
||||
// assumes: pp!=pp2, pp2 in pq, pp not in pq.
|
||||
// called only when there are repeating pps.
|
||||
private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) {
|
||||
int n=0;
|
||||
PhrasePositions pp3;
|
||||
//pop until finding pp2
|
||||
while ((pp3=pq.pop()) != pp2) {
|
||||
tmpPos[n++] = pp3;
|
||||
//printPositions(System.err, "Init: 1: Bef position");
|
||||
|
||||
// position the pp's
|
||||
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
|
||||
pp.firstPosition();
|
||||
}
|
||||
|
||||
//printPositions(System.err, "Init: 2: Aft position");
|
||||
|
||||
// one time initialization for this scorer (done only for the first candidate doc)
|
||||
if (!checkedRepeats) {
|
||||
checkedRepeats = true;
|
||||
ArrayList<PhrasePositions> ppsA = new ArrayList<PhrasePositions>();
|
||||
PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
|
||||
// check for repeats
|
||||
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
|
||||
if (pp.nextRepeating != null) {
|
||||
continue; // a repetition of an earlier pp
|
||||
}
|
||||
ppsA.add(pp);
|
||||
int tpPos = tpPos(pp);
|
||||
for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
|
||||
if (
|
||||
pp2.nextRepeating != null // already detected as a repetition of an earlier pp
|
||||
|| pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query!
|
||||
|| tpPos(pp2) != tpPos) { // not a repetition
|
||||
continue;
|
||||
}
|
||||
// a repetition
|
||||
hasRepeats = true;
|
||||
prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
|
||||
pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
|
||||
prevB = pp2;
|
||||
}
|
||||
}
|
||||
//insert back all but pp2
|
||||
for (n--; n>=0; n--) {
|
||||
pq.insertWithOverflow(tmpPos[n]);
|
||||
if (hasRepeats) {
|
||||
// clean dummy markers
|
||||
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
|
||||
if (pp.nextRepeating == dummyPP) {
|
||||
pp.nextRepeating = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
nrPps = ppsA.toArray(new PhrasePositions[0]);
|
||||
pq = new PhraseQueue(nrPps.length);
|
||||
}
|
||||
|
||||
//printPositions(System.err, "Init: 3: Aft check-repeats");
|
||||
|
||||
// with repeats must advance some repeating pp's so they all start with differing tp's
|
||||
if (hasRepeats) {
|
||||
for (PhrasePositions pp: nrPps) {
|
||||
if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
|
||||
return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//printPositions(System.err, "Init: 4: Aft advance-repeats");
|
||||
|
||||
// build queue from non repeating pps
|
||||
pq.clear();
|
||||
for (PhrasePositions pp: nrPps) {
|
||||
if (pp.position > end) {
|
||||
end = pp.position;
|
||||
}
|
||||
//insert pp back
|
||||
pq.add(pp);
|
||||
return pp2;
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
/** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
|
||||
private final int tpPos(PhrasePositions pp) {
|
||||
return pp.position + pp.offset;
|
||||
}
|
||||
|
||||
// private void printPositions(PrintStream ps, String title) {
|
||||
// ps.println();
|
||||
// ps.println("---- "+title);
|
||||
// int k = 0;
|
||||
// if (nrPps!=null) {
|
||||
// for (PhrasePositions pp: nrPps) {
|
||||
// ps.println(" " + k++ + " " + pp);
|
||||
// }
|
||||
// } else {
|
||||
// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
|
||||
// ps.println(" " + k++ + " " + pp);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
/**
|
||||
* Init PhrasePositions in place.
|
||||
* There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
|
||||
* <br>- Put in repeats[] each pp that has another pp with same position in the doc.
|
||||
* This relies on that the position in PP is computed as (TP.position - offset) and
|
||||
* so by adding offset we actually compare positions and identify that the two are
|
||||
* the same term.
|
||||
* An exclusion to this is two distinct terms in the same offset in query and same
|
||||
* position in doc. This case is detected by comparing just the (query) offsets,
|
||||
* and two such PPs are not considered "repeating".
|
||||
* <br>- Also mark each such pp by pp.repeats = true.
|
||||
* <br>Later can consult with repeats[] in termPositionsConflict(pp), making that check efficient.
|
||||
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
|
||||
* <br>- Example 1 - query with no repetitions: "ho my"~2
|
||||
* <br>- Example 2 - query with repetitions: "ho my my"~2
|
||||
* <br>- Example 3 - query with repetitions: "my ho my"~2
|
||||
* <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
|
||||
* @return end (max position), or -1 if any term ran out (i.e. done)
|
||||
* @throws IOException
|
||||
*/
|
||||
private int initPhrasePositions() throws IOException {
|
||||
int end = 0;
|
||||
|
||||
// no repeats at all (most common case is also the simplest one)
|
||||
if (checkedRepeats && repeats==null) {
|
||||
// build queue from list
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pp.firstPosition();
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.add(pp); // build pq from list
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
// position the pp's
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next)
|
||||
pp.firstPosition();
|
||||
|
||||
// one time initialization for this scorer
|
||||
if (!checkedRepeats) {
|
||||
checkedRepeats = true;
|
||||
// check for repeats
|
||||
LinkedHashSet<PhrasePositions> m = null; // see comment (*) below why order is important
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
int tpPos = pp.position + pp.offset;
|
||||
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
||||
if (pp.offset == pp2.offset) {
|
||||
continue; // not a repetition: the two PPs are originally in same offset in the query!
|
||||
}
|
||||
int tpPos2 = pp2.position + pp2.offset;
|
||||
if (tpPos2 == tpPos) {
|
||||
if (m == null)
|
||||
m = new LinkedHashSet<PhrasePositions>();
|
||||
pp.repeats = true;
|
||||
pp2.repeats = true;
|
||||
m.add(pp);
|
||||
m.add(pp2);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m!=null)
|
||||
repeats = m.toArray(new PhrasePositions[0]);
|
||||
}
|
||||
|
||||
// with repeats must advance some repeating pp's so they all start with differing tp's
|
||||
// (*) It is important that pps are handled by their original order in the query,
|
||||
// because we advance the pp with larger offset, and so processing them in that order
|
||||
// allows to cover all pairs.
|
||||
if (repeats!=null) {
|
||||
for (int i = 0; i < repeats.length; i++) {
|
||||
PhrasePositions pp = repeats[i];
|
||||
PhrasePositions pp2;
|
||||
while ((pp2 = termPositionsConflict(pp)) != null) {
|
||||
if (!pp2.nextPosition()) // among pps that do not differ, advance the pp with higher offset
|
||||
return -1; // ran out of a term -- done
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// build queue from list
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.add(pp); // build pq from list
|
||||
}
|
||||
|
||||
if (repeats!=null) {
|
||||
tmpPos = new PhrasePositions[pq.size()];
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
/**
|
||||
* We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
|
||||
* in the query of the same word would go elsewhere in the matched doc.
|
||||
* @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
|
||||
* out of the first two PPs found to not differ.
|
||||
*/
|
||||
private PhrasePositions termPositionsConflict(PhrasePositions pp) {
|
||||
// efficiency note: a more efficient implementation could keep a map between repeating
|
||||
// pp's, so that if pp1a, pp1b, pp1c are repeats of term1, and pp2a, pp2b are repeats
|
||||
// of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
|
||||
// However this would complicate code, for a rather rare case, so choice is to compromise here.
|
||||
int tpPos = pp.position + pp.offset;
|
||||
for (int i = 0; i < repeats.length; i++) {
|
||||
PhrasePositions pp2 = repeats[i];
|
||||
if (pp2 == pp) {
|
||||
continue;
|
||||
}
|
||||
if (pp.offset == pp2.offset) {
|
||||
continue; // not a repetition: the two PPs are originally in same offset in the query!
|
||||
}
|
||||
int tpPos2 = pp2.position + pp2.offset;
|
||||
if (tpPos2 == tpPos) {
|
||||
return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
|
||||
// ps.println();
|
||||
// ps.println("---- "+title);
|
||||
// ps.println("EXT: "+ext);
|
||||
// PhrasePositions[] t = new PhrasePositions[pq.size()];
|
||||
// if (pq.size()>0) {
|
||||
// t[0] = pq.pop();
|
||||
// ps.println(" " + 0 + " " + t[0]);
|
||||
// for (int i=1; i<t.length; i++) {
|
||||
// t[i] = pq.pop();
|
||||
// assert t[i-1].position <= t[i].position : "PQ is out of order: "+(i-1)+"::"+t[i-1]+" "+i+"::"+t[i];
|
||||
// ps.println(" " + i + " " + t[i]);
|
||||
// }
|
||||
// // add them back
|
||||
// for (int i=t.length-1; i>=0; i--) {
|
||||
// pq.add(t[i]);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -201,4 +201,148 @@ public class TestSloppyPhraseQuery extends LuceneTestCase {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** checks that no scores or freqs are infinite */
|
||||
private void assertSaneScoring(PhraseQuery pq, IndexSearcher searcher) throws Exception {
|
||||
searcher.search(pq, new Collector() {
|
||||
Scorer scorer;
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
this.scorer = scorer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
assertFalse(Float.isInfinite(scorer.freq()));
|
||||
assertFalse(Float.isInfinite(scorer.score()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||
// do nothing
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
});
|
||||
QueryUtils.check(random, pq, searcher);
|
||||
}
|
||||
|
||||
// LUCENE-3215
|
||||
public void testSlopWithHoles() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
|
||||
FieldType customType = new FieldType(TextField.TYPE_UNSTORED);
|
||||
customType.setOmitNorms(true);
|
||||
Field f = new Field("lyrics", customType, "");
|
||||
Document doc = new Document();
|
||||
doc.add(f);
|
||||
f.setValue("drug drug");
|
||||
iw.addDocument(doc);
|
||||
f.setValue("drug druggy drug");
|
||||
iw.addDocument(doc);
|
||||
f.setValue("drug druggy druggy drug");
|
||||
iw.addDocument(doc);
|
||||
f.setValue("drug druggy drug druggy drug");
|
||||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
|
||||
PhraseQuery pq = new PhraseQuery();
|
||||
// "drug the drug"~1
|
||||
pq.add(new Term("lyrics", "drug"), 1);
|
||||
pq.add(new Term("lyrics", "drug"), 4);
|
||||
pq.setSlop(0);
|
||||
assertEquals(0, is.search(pq, 4).totalHits);
|
||||
pq.setSlop(1);
|
||||
assertEquals(3, is.search(pq, 4).totalHits);
|
||||
pq.setSlop(2);
|
||||
assertEquals(4, is.search(pq, 4).totalHits);
|
||||
is.close();
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// LUCENE-3215
|
||||
public void testInfiniteFreq1() throws Exception {
|
||||
String document = "drug druggy drug drug drug";
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED)));
|
||||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
PhraseQuery pq = new PhraseQuery();
|
||||
// "drug the drug"~1
|
||||
pq.add(new Term("lyrics", "drug"), 1);
|
||||
pq.add(new Term("lyrics", "drug"), 3);
|
||||
pq.setSlop(1);
|
||||
assertSaneScoring(pq, is);
|
||||
is.close();
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// LUCENE-3215
|
||||
public void testInfiniteFreq2() throws Exception {
|
||||
String document =
|
||||
"So much fun to be had in my head " +
|
||||
"No more sunshine " +
|
||||
"So much fun just lying in my bed " +
|
||||
"No more sunshine " +
|
||||
"I can't face the sunlight and the dirt outside " +
|
||||
"Wanna stay in 666 where this darkness don't lie " +
|
||||
"Drug drug druggy " +
|
||||
"Got a feeling sweet like honey " +
|
||||
"Drug drug druggy " +
|
||||
"Need sensation like my baby " +
|
||||
"Show me your scars you're so aware " +
|
||||
"I'm not barbaric I just care " +
|
||||
"Drug drug drug " +
|
||||
"I need a reflection to prove I exist " +
|
||||
"No more sunshine " +
|
||||
"I am a victim of designer blitz " +
|
||||
"No more sunshine " +
|
||||
"Dance like a robot when you're chained at the knee " +
|
||||
"The C.I.A say you're all they'll ever need " +
|
||||
"Drug drug druggy " +
|
||||
"Got a feeling sweet like honey " +
|
||||
"Drug drug druggy " +
|
||||
"Need sensation like my baby " +
|
||||
"Snort your lines you're so aware " +
|
||||
"I'm not barbaric I just care " +
|
||||
"Drug drug druggy " +
|
||||
"Got a feeling sweet like honey " +
|
||||
"Drug drug druggy " +
|
||||
"Need sensation like my baby";
|
||||
|
||||
Directory dir = newDirectory();
|
||||
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newField("lyrics", document, new FieldType(TextField.TYPE_UNSTORED)));
|
||||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
|
||||
PhraseQuery pq = new PhraseQuery();
|
||||
// "drug the drug"~5
|
||||
pq.add(new Term("lyrics", "drug"), 1);
|
||||
pq.add(new Term("lyrics", "drug"), 3);
|
||||
pq.setSlop(5);
|
||||
assertSaneScoring(pq, is);
|
||||
is.close();
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue