mirror of https://github.com/apache/lucene.git
LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
For example query "B C B"~2 matches the doc "A B C D E". Search time cost of this fix is 4%, for sloppy phrase search. This fix also partially brings back the the tests checkSkipTo() in QueryUtils, (which was disabled by LUCENE-730), but this is now invoked only for non Boolean scorers. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@531733 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bb6d2cc9c9
commit
e06188c0d6
|
@ -98,6 +98,10 @@ Bug fixes
|
|||
the instance of IndexWriter (but, not the index itself) by
|
||||
referencing already deleted segments. This bug was only present
|
||||
in 2.2 (trunk), ie was never released. (Mike McCandless)
|
||||
|
||||
13. LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
|
||||
For example query "B C B"~2 matches the doc "A B C D E". (Doron Cohen)
|
||||
|
||||
|
||||
New features
|
||||
|
||||
|
|
|
@ -22,27 +22,30 @@ import org.apache.lucene.index.*;
|
|||
|
||||
final class ExactPhraseScorer extends PhraseScorer {
|
||||
|
||||
ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
|
||||
ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
|
||||
byte[] norms) {
|
||||
super(weight, tps, positions, similarity, norms);
|
||||
super(weight, tps, offsets, similarity, norms);
|
||||
}
|
||||
|
||||
protected final float phraseFreq() throws IOException {
|
||||
// sort list with pq
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pp.firstPosition();
|
||||
pq.put(pp); // build pq from list
|
||||
}
|
||||
pqToList(); // rebuild list from pq
|
||||
|
||||
// for counting how many times the exact phrase is found in current document,
|
||||
// just count how many times all PhrasePosition's have exactly the same position.
|
||||
int freq = 0;
|
||||
do { // find position w/ all terms
|
||||
while (first.position < last.position) { // scan forward in first
|
||||
do {
|
||||
if (!first.nextPosition())
|
||||
return (float)freq;
|
||||
} while (first.position < last.position);
|
||||
firstToLast();
|
||||
do {
|
||||
if (!first.nextPosition())
|
||||
return (float)freq;
|
||||
} while (first.position < last.position);
|
||||
firstToLast();
|
||||
}
|
||||
freq++; // all equal: a match
|
||||
} while (last.nextPosition());
|
||||
|
|
|
@ -20,6 +20,9 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
/**
|
||||
* Position of a term in a document that takes into account the term offset within the phrase.
|
||||
*/
|
||||
final class PhrasePositions {
|
||||
int doc; // current doc
|
||||
int position; // position in doc
|
||||
|
@ -27,6 +30,7 @@ final class PhrasePositions {
|
|||
int offset; // position in phrase
|
||||
TermPositions tp; // stream of positions
|
||||
PhrasePositions next; // used to make lists
|
||||
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
|
||||
|
||||
PhrasePositions(TermPositions t, int o) {
|
||||
tp = t;
|
||||
|
@ -61,6 +65,12 @@ final class PhrasePositions {
|
|||
nextPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Go to next location of this term current document, and set
|
||||
* <code>position</code> as <code>location - offset</code>, so that a
|
||||
* matching exact phrase is easily identified when all PhrasePositions
|
||||
* have exactly the same <code>position</code>.
|
||||
*/
|
||||
final boolean nextPosition() throws IOException {
|
||||
if (count-- > 0) { // read subsequent pos's
|
||||
position = tp.nextPosition() - offset;
|
||||
|
|
|
@ -28,7 +28,12 @@ final class PhraseQueue extends PriorityQueue {
|
|||
PhrasePositions pp1 = (PhrasePositions)o1;
|
||||
PhrasePositions pp2 = (PhrasePositions)o2;
|
||||
if (pp1.doc == pp2.doc)
|
||||
return pp1.position < pp2.position;
|
||||
if (pp1.position == pp2.position)
|
||||
// same doc and pp.position, so decide by actual term positions.
|
||||
// rely on: pp.position == tp.position - offset.
|
||||
return pp1.offset < pp2.offset;
|
||||
else
|
||||
return pp1.position < pp2.position;
|
||||
else
|
||||
return pp1.doc < pp2.doc;
|
||||
}
|
||||
|
|
|
@ -21,6 +21,16 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
/** Expert: Scoring functionality for phrase queries.
|
||||
* <br>A document is considered matching if it contains the phrase-query terms
|
||||
* at "valid" positons. What "valid positions" are
|
||||
* depends on the type of the phrase query: for an exact phrase query terms are required
|
||||
* to appear in adjacent locations, while for a sloppy phrase query some distance between
|
||||
* the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
|
||||
* is invoked for each document containing all the phrase query terms, in order to
|
||||
* compute the frequency of the phrase query in that document. A non zero frequency
|
||||
* means a match.
|
||||
*/
|
||||
abstract class PhraseScorer extends Scorer {
|
||||
private Weight weight;
|
||||
protected byte[] norms;
|
||||
|
@ -31,19 +41,23 @@ abstract class PhraseScorer extends Scorer {
|
|||
protected PhraseQueue pq;
|
||||
protected PhrasePositions first, last;
|
||||
|
||||
private float freq;
|
||||
private float freq; //prhase frequency in current doc as computed by phraseFreq().
|
||||
|
||||
|
||||
PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
|
||||
PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
|
||||
byte[] norms) {
|
||||
super(similarity);
|
||||
this.norms = norms;
|
||||
this.weight = weight;
|
||||
this.value = weight.getValue();
|
||||
|
||||
// convert tps to a list
|
||||
// convert tps to a list of phrase positions.
|
||||
// note: phrase-position differs from term-position in that its position
|
||||
// reflects the phrase offset: pp.pos = tp.pos - offset.
|
||||
// this allows to easily identify a matching (exact) phrase
|
||||
// when all PhrasePositions have exactly the same position.
|
||||
for (int i = 0; i < tps.length; i++) {
|
||||
PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
|
||||
PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
|
@ -94,6 +108,7 @@ abstract class PhraseScorer extends Scorer {
|
|||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
firstTime = false;
|
||||
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
|
||||
more = pp.skipTo(target);
|
||||
}
|
||||
|
@ -102,6 +117,13 @@ abstract class PhraseScorer extends Scorer {
|
|||
return doNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* For a document containing all the phrase query terms, compute the
|
||||
* frequency of the phrase in that document.
|
||||
* A non zero frequency means a match.
|
||||
* <br>Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
|
||||
* @return frequency of the phrase in current doc, 0 if not found.
|
||||
*/
|
||||
protected abstract float phraseFreq() throws IOException;
|
||||
|
||||
private void init() throws IOException {
|
||||
|
|
|
@ -20,38 +20,58 @@ package org.apache.lucene.search;
|
|||
import org.apache.lucene.index.TermPositions;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
||||
final class SloppyPhraseScorer extends PhraseScorer {
|
||||
private int slop;
|
||||
private PhrasePositions repeats[];
|
||||
private boolean checkedRepeats;
|
||||
|
||||
SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
|
||||
SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
|
||||
int slop, byte[] norms) {
|
||||
super(weight, tps, positions, similarity, norms);
|
||||
super(weight, tps, offsets, similarity, norms);
|
||||
this.slop = slop;
|
||||
}
|
||||
|
||||
/**
|
||||
* Score a candidate doc for all slop-valid position-combinations (matches)
|
||||
* encountered while traversing/hopping the PhrasePositions.
|
||||
* <br> The score contribution of a match depends on the distance:
|
||||
* <br> - highest score for distance=0 (exact match).
|
||||
* <br> - score gets lower as distance gets higher.
|
||||
* <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
|
||||
* once for "a b" (distance=0), and once for "b a" (distance=2).
|
||||
* <br>Pssibly not all valid combinations are encountered, because for efficiency
|
||||
* we always propagate the least PhrasePosition. This allows to base on
|
||||
* PriorityQueue and move forward faster.
|
||||
* As result, for example, document "a b c b a"
|
||||
* would score differently for queries "a b c"~4 and "c b a"~4, although
|
||||
* they really are equivalent.
|
||||
* Similarly, for doc "a b c b a f g", query "c b"~2
|
||||
* would get same score as "g f"~2, although "c b"~2 could be matched twice.
|
||||
* We may want to fix this in the future (currently not, for performance reasons).
|
||||
*/
|
||||
protected final float phraseFreq() throws IOException {
|
||||
pq.clear();
|
||||
int end = 0;
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pp.firstPosition();
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.put(pp); // build pq from list
|
||||
}
|
||||
|
||||
int end = initPhrasePositions();
|
||||
|
||||
float freq = 0.0f;
|
||||
boolean done = false;
|
||||
do {
|
||||
boolean done = (end<0);
|
||||
while (!done) {
|
||||
PhrasePositions pp = (PhrasePositions) pq.pop();
|
||||
int start = pp.position;
|
||||
int next = ((PhrasePositions) pq.top()).position;
|
||||
for (int pos = start; pos <= next; pos = pp.position) {
|
||||
start = pos; // advance pp to min window
|
||||
|
||||
boolean tpsDiffer = true;
|
||||
for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
|
||||
if (pos<=next && tpsDiffer)
|
||||
start = pos; // advance pp to min window
|
||||
if (!pp.nextPosition()) {
|
||||
done = true; // ran out of a term -- done
|
||||
done = true; // ran out of a term -- done
|
||||
break;
|
||||
}
|
||||
tpsDiffer = !pp.repeats || termPositionsDiffer(pp);
|
||||
}
|
||||
|
||||
int matchLength = end - start;
|
||||
|
@ -61,8 +81,113 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
|||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.put(pp); // restore pq
|
||||
} while (!done);
|
||||
}
|
||||
|
||||
return freq;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Init PhrasePositions in place.
|
||||
* There is a one time initializatin for this scorer:
|
||||
* <br>- Put in repeats[] each pp that has another pp with same position in the doc.
|
||||
* <br>- Also mark each such pp by pp.repeats = true.
|
||||
* <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
|
||||
* In particular, this allows to score queries with no repetiotions with no overhead due to this computation.
|
||||
* <br>- Example 1 - query with no repetitions: "ho my"~2
|
||||
* <br>- Example 2 - query with repetitions: "ho my my"~2
|
||||
* <br>- Example 3 - query with repetitions: "my ho my"~2
|
||||
* <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
|
||||
* @return end (max position), or -1 if any term ran out (i.e. done)
|
||||
* @throws IOException
|
||||
*/
|
||||
private int initPhrasePositions() throws IOException {
|
||||
int end = 0;
|
||||
|
||||
// no repeats at all (most common case is also the simplest one)
|
||||
if (checkedRepeats && repeats==null) {
|
||||
// build queue from list
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pp.firstPosition();
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.put(pp); // build pq from list
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
// position the pp's
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next)
|
||||
pp.firstPosition();
|
||||
|
||||
// one time initializatin for this scorer
|
||||
if (!checkedRepeats) {
|
||||
checkedRepeats = true;
|
||||
// check for repeats
|
||||
HashMap m = null;
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
int tpPos = pp.position + pp.offset;
|
||||
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
||||
int tpPos2 = pp2.position + pp2.offset;
|
||||
if (tpPos2 == tpPos) {
|
||||
if (m == null)
|
||||
m = new HashMap();
|
||||
pp.repeats = true;
|
||||
pp2.repeats = true;
|
||||
m.put(pp,null);
|
||||
m.put(pp2,null);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m!=null)
|
||||
repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]);
|
||||
}
|
||||
|
||||
// with repeats must advance some repeating pp's so they all start with differing tp's
|
||||
if (repeats!=null) {
|
||||
// must propagate higher offsets first (otherwise might miss matches).
|
||||
Arrays.sort(repeats, new Comparator() {
|
||||
public int compare(Object x, Object y) {
|
||||
return ((PhrasePositions) y).offset - ((PhrasePositions) x).offset;
|
||||
}});
|
||||
// now advance them
|
||||
for (int i = 0; i < repeats.length; i++) {
|
||||
PhrasePositions pp = repeats[i];
|
||||
while (!termPositionsDiffer(pp)) {
|
||||
if (!pp.nextPosition())
|
||||
return -1; // ran out of a term -- done
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// build queue from list
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
if (pp.position > end)
|
||||
end = pp.position;
|
||||
pq.put(pp); // build pq from list
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
// disalow two pp's to have the same tp position, so that same word twice
|
||||
// in query would go elswhere in the matched doc
|
||||
private boolean termPositionsDiffer(PhrasePositions pp) {
|
||||
// efficiency note: a more efficient implemention could keep a map between repeating
|
||||
// pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
|
||||
// of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
|
||||
// However this would complicate code, for a rather rare case, so choice is to compromise here.
|
||||
int tpPos = pp.position + pp.offset;
|
||||
for (int i = 0; i < repeats.length; i++) {
|
||||
PhrasePositions pp2 = repeats[i];
|
||||
if (pp2 == pp)
|
||||
continue;
|
||||
int tpPos2 = pp2.position + pp2.offset;
|
||||
if (tpPos2 == tpPos)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -68,18 +68,15 @@ public class QueryUtils {
|
|||
|
||||
/** various query sanity checks on a searcher */
|
||||
public static void check(Query q1, Searcher s) {
|
||||
// Disabled because this started failing after LUCENE-730 patch was applied
|
||||
// try {
|
||||
try {
|
||||
check(q1);
|
||||
/* disabled for use of BooleanScorer in BooleanScorer2.
|
||||
if (s!=null && s instanceof IndexSearcher) {
|
||||
IndexSearcher is = (IndexSearcher)s;
|
||||
// checkSkipTo(q1,is);
|
||||
checkSkipTo(q1,is);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/** alternate scorer skipTo(),skipTo(),next(),next(),skipTo(),skipTo(), etc
|
||||
|
@ -87,42 +84,71 @@ public class QueryUtils {
|
|||
*/
|
||||
public static void checkSkipTo(final Query q, final IndexSearcher s) throws IOException {
|
||||
//System.out.println("Checking "+q);
|
||||
final Weight w = q.weight(s);
|
||||
final Scorer scorer = w.scorer(s.getIndexReader());
|
||||
|
||||
// FUTURE: ensure scorer.doc()==-1
|
||||
|
||||
|
||||
if (BooleanQuery.getUseScorer14()) return; // 1.4 doesn't support skipTo
|
||||
|
||||
final int[] which = new int[1];
|
||||
final int[] sdoc = new int[] {-1};
|
||||
final float maxDiff = 1e-5f;
|
||||
s.search(q,new HitCollector() {
|
||||
public void collect(int doc, float score) {
|
||||
try {
|
||||
boolean more = (which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next();
|
||||
sdoc[0] = scorer.doc();
|
||||
float scorerScore = scorer.score();
|
||||
float scoreDiff = Math.abs(score-scorerScore);
|
||||
scoreDiff=0; // TODO: remove this go get LUCENE-697 failures
|
||||
if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
|
||||
throw new RuntimeException("ERROR matching docs:"
|
||||
+"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
|
||||
+" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
|
||||
+"\n\thitCollector.doc=" + doc + " score="+score
|
||||
+"\n\t Scorer=" + scorer
|
||||
+"\n\t Query=" + q
|
||||
+"\n\t Searcher=" + s
|
||||
);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
});
|
||||
final int skip_op = 0;
|
||||
final int next_op = 1;
|
||||
final int orders [][] = {
|
||||
{skip_op},
|
||||
{next_op},
|
||||
{skip_op, next_op},
|
||||
{next_op, skip_op},
|
||||
{skip_op, skip_op, next_op, next_op},
|
||||
{next_op, next_op, skip_op, skip_op},
|
||||
{skip_op, skip_op, skip_op, next_op, next_op},
|
||||
};
|
||||
for (int k = 0; k < orders.length; k++) {
|
||||
final int order[] = orders[k];
|
||||
//System.out.print("Order:");for (int i = 0; i < order.length; i++) System.out.print(order[i]==skip_op ? " skip()":" next()"); System.out.println();
|
||||
final int opidx[] = {0};
|
||||
|
||||
// make sure next call to scorer is false.
|
||||
TestCase.assertFalse((which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next());
|
||||
final Weight w = q.weight(s);
|
||||
final Scorer scorer = w.scorer(s.getIndexReader());
|
||||
|
||||
if (scorer instanceof BooleanScorer || scorer instanceof BooleanScorer2) {
|
||||
return; // TODO change this if BooleanScorers would once again guarantee docs in order.
|
||||
}
|
||||
|
||||
// FUTURE: ensure scorer.doc()==-1
|
||||
|
||||
final int[] sdoc = new int[] {-1};
|
||||
final float maxDiff = 1e-5f;
|
||||
s.search(q,new HitCollector() {
|
||||
public void collect(int doc, float score) {
|
||||
try {
|
||||
int op = order[(opidx[0]++)%order.length];
|
||||
//System.out.println(op==skip_op ? "skip("+(sdoc[0]+1)+")":"next()");
|
||||
boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
|
||||
sdoc[0] = scorer.doc();
|
||||
float scorerScore = scorer.score();
|
||||
float scoreDiff = Math.abs(score-scorerScore);
|
||||
if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
|
||||
StringBuffer sbord = new StringBuffer();
|
||||
for (int i = 0; i < order.length; i++)
|
||||
sbord.append(order[i]==skip_op ? " skip()":" next()");
|
||||
throw new RuntimeException("ERROR matching docs:"
|
||||
+"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
|
||||
+" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
|
||||
+"\n\thitCollector.doc=" + doc + " score="+score
|
||||
+"\n\t Scorer=" + scorer
|
||||
+"\n\t Query=" + q
|
||||
+"\n\t Searcher=" + s
|
||||
+"\n\t Order=" + sbord
|
||||
);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// make sure next call to scorer is false.
|
||||
int op = order[(opidx[0]++)%order.length];
|
||||
//System.out.println(op==skip_op ? "last: skip()":"last: next()");
|
||||
boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
|
||||
TestCase.assertFalse(more);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -35,6 +35,10 @@ import java.io.Reader;
|
|||
* @author Erik Hatcher
|
||||
*/
|
||||
public class TestPhraseQuery extends TestCase {
|
||||
|
||||
/** threshold for comparing floats */
|
||||
public static final float SCORE_COMP_THRESH = 1e-6f;
|
||||
|
||||
private IndexSearcher searcher;
|
||||
private PhraseQuery query;
|
||||
private RAMDirectory directory;
|
||||
|
@ -57,8 +61,17 @@ public class TestPhraseQuery extends TestCase {
|
|||
doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
Fieldable repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED);
|
||||
doc.add(repeatedField);
|
||||
doc.add(new Field("palindrome", "one two three two one", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
|
@ -341,12 +354,188 @@ public class TestPhraseQuery extends TestCase {
|
|||
query.add(new Term("repeated", "part"));
|
||||
query.add(new Term("repeated", "second"));
|
||||
query.add(new Term("repeated", "part"));
|
||||
query.setSlop(99);
|
||||
query.setSlop(100);
|
||||
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals(0, hits.length());
|
||||
assertEquals("slop of 100 just right", 1, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
query.setSlop(99);
|
||||
|
||||
hits = searcher.search(query);
|
||||
assertEquals("slop of 99 not enough", 0, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
}
|
||||
|
||||
// work on two docs like this: "phrase exist notexist exist found"
|
||||
public void testNonExistingPhrase() throws IOException {
|
||||
// phrase without repetitions that exists in 2 docs
|
||||
query.add(new Term("nonexist", "phrase"));
|
||||
query.add(new Term("nonexist", "notexist"));
|
||||
query.add(new Term("nonexist", "found"));
|
||||
query.setSlop(2); // would be found this way
|
||||
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// phrase with repetitions that exists in 2 docs
|
||||
query = new PhraseQuery();
|
||||
query.add(new Term("nonexist", "phrase"));
|
||||
query.add(new Term("nonexist", "exist"));
|
||||
query.add(new Term("nonexist", "exist"));
|
||||
query.setSlop(1); // would be found
|
||||
|
||||
hits = searcher.search(query);
|
||||
assertEquals("phrase with repetitions exists in two docs", 2, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// phrase I with repetitions that does not exist in any doc
|
||||
query = new PhraseQuery();
|
||||
query.add(new Term("nonexist", "phrase"));
|
||||
query.add(new Term("nonexist", "notexist"));
|
||||
query.add(new Term("nonexist", "phrase"));
|
||||
query.setSlop(1000); // would not be found no matter how high the slop is
|
||||
|
||||
hits = searcher.search(query);
|
||||
assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// phrase II with repetitions that does not exist in any doc
|
||||
query = new PhraseQuery();
|
||||
query.add(new Term("nonexist", "phrase"));
|
||||
query.add(new Term("nonexist", "exist"));
|
||||
query.add(new Term("nonexist", "exist"));
|
||||
query.add(new Term("nonexist", "exist"));
|
||||
query.setSlop(1000); // would not be found no matter how high the slop is
|
||||
|
||||
hits = searcher.search(query);
|
||||
assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Working on a 2 fields like this:
|
||||
* Field("field", "one two three four five")
|
||||
* Field("palindrome", "one two three two one")
|
||||
* Phrase of size 2 occuriong twice, once in order and once in reverse,
|
||||
* because doc is a palyndrome, is counted twice.
|
||||
* Also, in this case order in query does not matter.
|
||||
* Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
|
||||
*/
|
||||
public void testPalyndrome2() throws Exception {
|
||||
|
||||
// search on non palyndrome, find phrase with no slop, using exact phrase scorer
|
||||
query.setSlop(0); // to use exact phrase scorer
|
||||
query.add(new Term("field", "two"));
|
||||
query.add(new Term("field", "three"));
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals("phrase found with exact phrase scorer", 1, hits.length());
|
||||
float score0 = hits.score(0);
|
||||
//System.out.println("(exact) field: two three: "+score0);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// search on non palyndrome, find phrase with slop 2, though no slop required here.
|
||||
query.setSlop(2); // to use sloppy scorer
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score1 = hits.score(0);
|
||||
//System.out.println("(sloppy) field: two three: "+score1);
|
||||
assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// search ordered in palyndrome, find it twice
|
||||
query = new PhraseQuery();
|
||||
query.setSlop(2); // must be at least two for both ordered and reversed to match
|
||||
query.add(new Term("palindrome", "two"));
|
||||
query.add(new Term("palindrome", "three"));
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score2 = hits.score(0);
|
||||
//System.out.println("palindrome: two three: "+score2);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
|
||||
//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
|
||||
|
||||
// search reveresed in palyndrome, find it twice
|
||||
query = new PhraseQuery();
|
||||
query.setSlop(2); // must be at least two for both ordered and reversed to match
|
||||
query.add(new Term("palindrome", "three"));
|
||||
query.add(new Term("palindrome", "two"));
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score3 = hits.score(0);
|
||||
//System.out.println("palindrome: three two: "+score3);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
|
||||
//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
|
||||
//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Working on a 2 fields like this:
|
||||
* Field("field", "one two three four five")
|
||||
* Field("palindrome", "one two three two one")
|
||||
* Phrase of size 3 occuriong twice, once in order and once in reverse,
|
||||
* because doc is a palyndrome, is counted twice.
|
||||
* Also, in this case order in query does not matter.
|
||||
* Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
|
||||
*/
|
||||
public void testPalyndrome3() throws Exception {
|
||||
|
||||
// search on non palyndrome, find phrase with no slop, using exact phrase scorer
|
||||
query.setSlop(0); // to use exact phrase scorer
|
||||
query.add(new Term("field", "one"));
|
||||
query.add(new Term("field", "two"));
|
||||
query.add(new Term("field", "three"));
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals("phrase found with exact phrase scorer", 1, hits.length());
|
||||
float score0 = hits.score(0);
|
||||
//System.out.println("(exact) field: one two three: "+score0);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// search on non palyndrome, find phrase with slop 3, though no slop required here.
|
||||
query.setSlop(4); // to use sloppy scorer
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score1 = hits.score(0);
|
||||
//System.out.println("(sloppy) field: one two three: "+score1);
|
||||
assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
// search ordered in palyndrome, find it twice
|
||||
query = new PhraseQuery();
|
||||
query.setSlop(4); // must be at least four for both ordered and reversed to match
|
||||
query.add(new Term("palindrome", "one"));
|
||||
query.add(new Term("palindrome", "two"));
|
||||
query.add(new Term("palindrome", "three"));
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score2 = hits.score(0);
|
||||
//System.out.println("palindrome: one two three: "+score2);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
|
||||
//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
|
||||
|
||||
// search reveresed in palyndrome, find it twice
|
||||
query = new PhraseQuery();
|
||||
query.setSlop(4); // must be at least four for both ordered and reversed to match
|
||||
query.add(new Term("palindrome", "three"));
|
||||
query.add(new Term("palindrome", "two"));
|
||||
query.add(new Term("palindrome", "one"));
|
||||
hits = searcher.search(query);
|
||||
assertEquals("just sloppy enough", 1, hits.length());
|
||||
float score3 = hits.score(0);
|
||||
//System.out.println("palindrome: three two one: "+score3);
|
||||
QueryUtils.check(query,searcher);
|
||||
|
||||
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
|
||||
//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
|
||||
//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue