LUCENE-3821: SloppyPhraseScorer sometimes misses documents that ExactPhraseScorer finds.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1299112 13f79535-47bb-0310-9956-ffa450edef68

commit 187bfe2ebc, parent bf2be9b35a
CHANGES.txt
@@ -919,7 +919,13 @@ Bug fixes
  from the delegate DocIdSet.iterator(), which is allowed to return
  null by DocIdSet specification when no documents match.
  (Shay Banon via Uwe Schindler)

* LUCENE-3821: SloppyPhraseScorer missed documents that ExactPhraseScorer finds.
  When a phrase query had repeating terms (e.g. "yes ho yes"), the
  sloppy query missed documents that the exact query matched.
  Fixed, except for repeating multi-terms (e.g. "yes ho yes|no").
  (Robert Muir, Doron Cohen)

Optimizations

* LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by
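
The entry above describes the failure mode concretely enough to capture in a test. The following is a hypothetical sketch of that scenario (not part of this commit), written in the style of the TestMultiPhraseQuery tests later in this patch and assuming the same LuceneTestCase helpers (newDirectory, newSearcher, newField, random); the field name, example text and slop value are illustrative only.

// Hypothetical test sketch: a sloppy PhraseQuery with a repeating term, the case this
// entry describes. Before this fix the sloppy scorer could miss the document even though
// the exact (slop=0) scorer matched it.
public void testSloppyWithRepeatingTerm() throws IOException {
  Directory dir = newDirectory();                       // LuceneTestCase helper
  RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  Document doc = new Document();
  doc.add(newField("body", "yes ho yes", TextField.TYPE_STORED));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  PhraseQuery q = new PhraseQuery();                    // "yes ho yes" with a repeated term
  q.add(new Term("body", "yes"));
  q.add(new Term("body", "ho"));
  q.add(new Term("body", "yes"));
  q.setSlop(2);                                         // sloppy: should still match the exact occurrence
  assertEquals(1, searcher.search(q, 1).totalHits);

  reader.close();
  dir.close();
}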
MultiPhraseQuery.java
@@ -22,7 +22,6 @@ import java.util.*;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;

@@ -238,7 +237,7 @@ public class MultiPhraseQuery extends Query {
          docFreq = termsEnum.docFreq();
        }

        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]);
        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
      }

      // sort by increasing docFreq order

@@ -314,9 +313,21 @@ public class MultiPhraseQuery extends Query {
    }

    buffer.append("\"");
    int k = 0;
    Iterator<Term[]> i = termArrays.iterator();
    int lastPos = -1;
    boolean first = true;
    while (i.hasNext()) {
      Term[] terms = i.next();
      int position = positions.get(k);
      if (first) {
        first = false;
      } else {
        buffer.append(" ");
        for (int j=1; j<(position-lastPos); j++) {
          buffer.append("? ");
        }
      }
      if (terms.length > 1) {
        buffer.append("(");
        for (int j = 0; j < terms.length; j++) {

@@ -328,8 +339,8 @@ public class MultiPhraseQuery extends Query {
      } else {
        buffer.append(terms[0].text());
      }
      if (i.hasNext())
        buffer.append(" ");
      lastPos = position;
      ++k;
    }
    buffer.append("\"");
PhrasePositions.java
@@ -31,12 +31,15 @@ final class PhrasePositions {
  final int ord; // unique across all PhrasePositions instances
  final DocsAndPositionsEnum postings; // stream of docs & positions
  PhrasePositions next; // used to make lists
  PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets
  int rptGroup = -1; // >=0 indicates that this is a repeating PP
  int rptInd; // index in the rptGroup
  final Term[] terms; // for repetitions initialization

  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord, Term[] terms) {
    this.postings = postings;
    offset = o;
    this.ord = ord;
    this.terms = terms;
  }

  final boolean next() throws IOException { // increments to next doc

@@ -78,8 +81,8 @@ final class PhrasePositions {
  @Override
  public String toString() {
    String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
    if (nextRepeating!=null) {
      s += " rpt[ "+nextRepeating+" ]";
    if (rptGroup >=0 ) {
      s += " rpt:"+rptGroup+",i"+rptInd;
    }
    return s;
  }
PhraseQuery.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.index.AtomicReaderContext;

@@ -137,23 +138,46 @@ public class PhraseQuery extends Query {
    final DocsAndPositionsEnum postings;
    final int docFreq;
    final int position;
    final Term term;
    final Term[] terms;
    final int nTerms; // for faster comparisons

    public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) {
    public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term... terms) {
      this.postings = postings;
      this.docFreq = docFreq;
      this.position = position;
      this.term = term;
      nTerms = terms==null ? 0 : terms.length;
      if (nTerms>0) {
        if (terms.length==1) {
          this.terms = terms;
        } else {
          Term[] terms2 = new Term[terms.length];
          System.arraycopy(terms, 0, terms2, 0, terms.length);
          Arrays.sort(terms2);
          this.terms = terms2;
        }
      } else {
        this.terms = null;
      }
    }

    public int compareTo(PostingsAndFreq other) {
      if (docFreq == other.docFreq) {
        if (position == other.position) {
          return term.compareTo(other.term);
        }
      if (docFreq != other.docFreq) {
        return docFreq - other.docFreq;
      }
      if (position != other.position) {
        return position - other.position;
      }
      return docFreq - other.docFreq;
      if (nTerms != other.nTerms) {
        return nTerms - other.nTerms;
      }
      if (nTerms == 0) {
        return 0;
      }
      for (int i=0; i<terms.length; i++) {
        int res = terms[i].compareTo(other.terms[i]);
        if (res!=0) return res;
      }
      return 0;
    }

    @Override

@@ -162,7 +186,9 @@ public class PhraseQuery extends Query {
      int result = 1;
      result = prime * result + docFreq;
      result = prime * result + position;
      result = prime * result + ((term == null) ? 0 : term.hashCode());
      for (int i=0; i<nTerms; i++) {
        result = prime * result + terms[i].hashCode();
      }
      return result;
    }

@@ -174,10 +200,8 @@ public class PhraseQuery extends Query {
      PostingsAndFreq other = (PostingsAndFreq) obj;
      if (docFreq != other.docFreq) return false;
      if (position != other.position) return false;
      if (term == null) {
        if (other.term != null) return false;
      } else if (!term.equals(other.term)) return false;
      return true;
      if (terms == null) return other.terms == null;
      return Arrays.equals(terms, other.terms);
    }
  }
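
The interleaved old and new compareTo bodies above can be hard to read in this unmarked diff: the previous code broke ties with the single term, while the new code compares docFreq, then position, then the terms array element by element, relying on the constructor having copied and sorted the array so the comparison does not depend on the order in which MultiPhraseQuery supplied the terms. Below is a hypothetical, standalone restatement of that ordering (class and field names are illustrative, not the Lucene class itself).

// Hypothetical sketch of the ordering the new compareTo implements.
import java.util.Arrays;
import java.util.Comparator;

final class PostingsOrderSketch {
  static final class Entry {
    final int docFreq;
    final int position;
    final String[] sortedTerms;            // sorted up front, as the constructor above does
    Entry(int docFreq, int position, String... terms) {
      this.docFreq = docFreq;
      this.position = position;
      this.sortedTerms = terms.clone();
      Arrays.sort(this.sortedTerms);
    }
  }

  static final Comparator<Entry> ORDER = new Comparator<Entry>() {
    public int compare(Entry a, Entry b) {
      if (a.docFreq != b.docFreq) {
        return a.docFreq - b.docFreq;      // rarest postings sort first
      }
      if (a.position != b.position) {
        return a.position - b.position;    // then by position in the query
      }
      if (a.sortedTerms.length != b.sortedTerms.length) {
        return a.sortedTerms.length - b.sortedTerms.length;
      }
      for (int i = 0; i < a.sortedTerms.length; i++) {
        int res = a.sortedTerms[i].compareTo(b.sortedTerms[i]);
        if (res != 0) {
          return res;                      // finally, lexicographically by term
        }
      }
      return 0;
    }
  };
}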
PhraseScorer.java
@@ -49,11 +49,11 @@ abstract class PhraseScorer extends Scorer {
    // this allows to easily identify a matching (exact) phrase
    // when all PhrasePositions have exactly the same position.
    if (postings.length > 0) {
      min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
      min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
      max = min;
      max.doc = -1;
      for (int i = 1; i < postings.length; i++) {
        PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
        PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
        max.next = pp;
        max = pp;
        max.doc = -1;
SloppyPhraseScorer.java
@@ -19,22 +19,38 @@ package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.OpenBitSet;

final class SloppyPhraseScorer extends PhraseScorer {
  private int slop;
  private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
  private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
  private PhraseQueue pq; // for advancing min position
  private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset

  private final int slop;
  private final int numPostings;
  private final PhraseQueue pq; // for advancing min position

  private int end; // current largest phrase position

  private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
  private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
  private boolean hasMultiTermRpts; //
  private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. same term), sorted by (query) offset
  private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps

  SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
      int slop, Similarity.SloppySimScorer docScorer) {
    super(weight, postings, docScorer);
    this.slop = slop;
    this.numPostings = postings==null ? 0 : postings.length;
    pq = new PhraseQueue(postings.length);
  }

  /**
   * Score a candidate doc for all slop-valid position-combinations (matches)
   * encountered while traversing/hopping the PhrasePositions.

@@ -55,31 +71,27 @@ final class SloppyPhraseScorer extends PhraseScorer {
   */
  @Override
  protected float phraseFreq() throws IOException {
    int end = initPhrasePositions();
    //printPositions(System.err, "INIT DONE:");
    if (end==Integer.MIN_VALUE) {
    if (!initPhrasePositions()) {
      return 0.0f;
    }

    float freq = 0.0f;
    PhrasePositions pp = pq.pop();
    int matchLength = end - pp.position;
    int next = pq.size()>0 ? pq.top().position : pp.position;
    //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
    while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) {
      if (pp.position > next) {
        //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
    int next = pq.top().position;
    while (advancePP(pp)) {
      if (hasRpts && !advanceRpts(pp)) {
        break; // pps exhausted
      }
      if (pp.position > next) { // done minimizing current match-length
        if (matchLength <= slop) {
          freq += docScorer.computeSlopFactor(matchLength); // score match
        }
        pq.add(pp);
        pp = pq.pop();
        next = pq.size()>0 ? pq.top().position : pp.position;
        next = pq.top().position;
        matchLength = end - pp.position;
        //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
      } else {
        int matchLength2 = end - pp.position;
        //printQueue(System.err, pp, "C: mlen2<mlen: next="+next+" matchLength="+matchLength+" matchLength2="+matchLength2);
        if (matchLength2 < matchLength) {
          matchLength = matchLength2;
        }
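
The loop above pops the PhrasePositions with the smallest position, treats end - pp.position as the current match length, scores a match whenever that length fits within the slop, then advances the popped pp and pushes it back. As a rough illustration of the bookkeeping for a single alignment (not the full repositioning loop, and not code from this patch), assuming the 1/(distance+1) slop factor used by the default TF/IDF similarity:

// Hypothetical illustration: match length for one alignment of a sloppy phrase, using the
// same position = (term position in doc) - (query offset) convention as PhrasePositions.
public class SloppyMatchSketch {
  static float singleAlignmentFreq(int slop, int[] docPositions, int[] queryOffsets) {
    int min = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;               // "current largest phrase position"
    for (int i = 0; i < docPositions.length; i++) {
      int position = docPositions[i] - queryOffsets[i];
      min = Math.min(min, position);
      end = Math.max(end, position);
    }
    int matchLength = end - min;               // the quantity the loop above minimizes
    return matchLength <= slop ? 1f / (matchLength + 1) : 0f;
  }

  public static void main(String[] args) {
    // "a b"~2 against the document "a x b": a at doc position 0 (query offset 0),
    // b at doc position 2 (query offset 1) -> matchLength 1 <= 2, contribution 0.5
    System.out.println(singleAlignmentFreq(2, new int[] {0, 2}, new int[] {0, 1}));
  }
}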
@@ -91,53 +103,82 @@ final class SloppyPhraseScorer extends PhraseScorer {
    return freq;
  }

  /**
   * Advance repeating pps of an input (non-repeating) pp.
   * Return a modified 'end' in case pp or its repeats exceeds original 'end'.
   * "Dirty" trick: when there are repeats, modifies pp's position to that of
   * least repeater of pp (needed when due to holes repeaters' positions are "back").
   */
  private int advanceRepeats(PhrasePositions pp, int end) throws IOException {
    int repeatsEnd = end;
    if (pp.position > repeatsEnd) {
      repeatsEnd = pp.position;
  /** advance a PhrasePosition and update 'end', return false if exhausted */
  private boolean advancePP(PhrasePositions pp) throws IOException {
    if (!pp.nextPosition()) {
      return false;
    }
    if (!hasRepeats) {
      return repeatsEnd;
    if (pp.position > end) {
      end = pp.position;
    }
    return true;
  }

  /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
   * of the two colliding pps. Note that there can only be one collision, as by the initialization
   * there were no collisions before pp was advanced. */
  private boolean advanceRpts(PhrasePositions pp) throws IOException {
    if (pp.rptGroup < 0) {
      return true; // not a repeater
    }
    PhrasePositions[] rg = rptGroups[pp.rptGroup];
    OpenBitSet bits = new OpenBitSet(rg.length); // for re-queuing after collisions are resolved
    int k0 = pp.rptInd;
    int k;
    while((k=collide(pp)) >= 0) {
      pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
      if (!advancePP(pp)) {
        return false; // exhausted
      }
      if (k != k0) { // careful: mark only those currently in the queue
        bits.set(k); // mark that pp2 need to be re-queued
      }
    }
    // collisions resolved, now re-queue
    // empty (partially) the queue until seeing all pps advanced for resolving collisions
    int n = 0;
    while (bits.cardinality() > 0) {
      PhrasePositions pp2 = pq.pop();
      rptStack[n++] = pp2;
      if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) {
        bits.clear(pp2.rptInd);
      }
    }
    // add back to queue
    for (int i=n-1; i>=0; i--) {
      pq.add(rptStack[i]);
    }
    return true;
  }

  /** compare two pps, but only by position and offset */
  private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
    if (pp.position < pp2.position ||
        (pp.position == pp2.position && pp.offset < pp2.offset)) {
      return pp;
    }
    return pp2;
  }

  /** index of a pp2 colliding with pp, or -1 if none */
  private int collide(PhrasePositions pp) {
    int tpPos = tpPos(pp);
    for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
      while (tpPos(pp2) <= tpPos) {
        if (!pp2.nextPosition()) {
          return Integer.MIN_VALUE;
        }
      }
      tpPos = tpPos(pp2);
      if (pp2.position > repeatsEnd) {
        repeatsEnd = pp2.position;
      }
      // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position.
      // so in order to have the right "start" in matchLength computation we fake pp.position.
      // this relies on pp.nextPosition() not using pp.position.
      if (pp2.position < pp.position) {
        pp.position = pp2.position;
    PhrasePositions[] rg = rptGroups[pp.rptGroup];
    for (int i=0; i<rg.length; i++) {
      PhrasePositions pp2 = rg[i];
      if (pp2 != pp && tpPos(pp2) == tpPos) {
        return pp2.rptInd;
      }
    }
    return repeatsEnd;
    return -1;
  }

  /**
   * Initialize PhrasePositions in place.
   * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
   * A one time initialization for this scorer (on first doc matching all terms):
   * <ul>
   * <li>Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
   * <li>For each such group:
   * <ul>
   * <li>form an inner linked list of the repeating ones.
   * <li>propagate all group members but first so that they land on different tpPos().
   * </ul>
   * <li>Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
   * <li>Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
   * <li>Check if there are repetitions
   * <li>If there are, find groups of repetitions.
   * </ul>
   * Examples:
   * <ol>

@@ -145,118 +186,305 @@ final class SloppyPhraseScorer extends PhraseScorer {
   * <li>repetitions: <b>"ho my my"~2</b>
   * <li>repetitions: <b>"my ho my"~2</b>
   * </ol>
   * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
   * @return false if PPs are exhausted (and so current doc will not be a match)
   */
  private int initPhrasePositions() throws IOException {
    int end = Integer.MIN_VALUE;

    // no repeats at all (most common case is also the simplest one)
    if (checkedRepeats && !hasRepeats) {
      // build queue from list
      pq.clear();
      for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
        pp.firstPosition();
        if (pp.position > end) {
          end = pp.position;
        }
        pq.add(pp); // build pq from list
      }
      return end;
  private boolean initPhrasePositions() throws IOException {
    end = Integer.MIN_VALUE;
    if (!checkedRpts) {
      return initFirstTime();
    }

    //printPositions(System.err, "Init: 1: Bef position");

    // position the pp's
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      pp.firstPosition();
    if (!hasRpts) {
      initSimple();
      return true; // PPs available
    }

    //printPositions(System.err, "Init: 2: Aft position");

    // one time initialization for this scorer (done only for the first candidate doc)
    if (!checkedRepeats) {
      checkedRepeats = true;
      ArrayList<PhrasePositions> ppsA = new ArrayList<PhrasePositions>();
      PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
      // check for repeats
      for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
        if (pp.nextRepeating != null) {
          continue; // a repetition of an earlier pp
        }
        ppsA.add(pp);
        int tpPos = tpPos(pp);
        for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
          if (
              pp2.nextRepeating != null // already detected as a repetition of an earlier pp
              || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query!
              || tpPos(pp2) != tpPos) { // not a repetition
            continue;
          }
          // a repetition
          hasRepeats = true;
          prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
          pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
          prevB = pp2;
        }
      }
      if (hasRepeats) {
        // clean dummy markers
        for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
          if (pp.nextRepeating == dummyPP) {
            pp.nextRepeating = null;
          }
        }
      }
      nrPps = ppsA.toArray(new PhrasePositions[0]);
      pq = new PhraseQueue(nrPps.length);
    }

    //printPositions(System.err, "Init: 3: Aft check-repeats");

    // with repeats must advance some repeating pp's so they all start with differing tp's
    if (hasRepeats) {
      for (PhrasePositions pp: nrPps) {
        if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
          return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
        }
      }
    }

    //printPositions(System.err, "Init: 4: Aft advance-repeats");

    // build queue from non repeating pps
    return initComplex();
  }

  /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
  private void initSimple() throws IOException {
    //System.err.println("initSimple: doc: "+min.doc);
    pq.clear();
    for (PhrasePositions pp: nrPps) {
    // position pps and build queue from list
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      pp.firstPosition();
      if (pp.position > end) {
        end = pp.position;
      }
      pq.add(pp);
    }

    return end;
  }

  /** with repeats: not so simple. */
  private boolean initComplex() throws IOException {
    //System.err.println("initComplex: doc: "+min.doc);
    placeFirstPositions();
    if (!advanceRepeatGroups()) {
      return false; // PPs exhausted
    }
    fillQueue();
    return true; // PPs available
  }

  /** move all PPs to their first position */
  private void placeFirstPositions() throws IOException {
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      pp.firstPosition();
    }
  }

  /** Fill the queue (all pps are already placed) */
  private void fillQueue() {
    pq.clear();
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      if (pp.position > end) {
        end = pp.position;
      }
      pq.add(pp);
    }
  }

  /** At initialization (each doc), each repetition group is sorted by (query) offset.
   * This provides the start condition: no collisions.
   * <p>Case 1: no multi-term repeats<br>
   * It is sufficient to advance each pp in the group by one less than its group index.
   * So the lesser pp is not advanced, the 2nd one is advanced once, the 3rd one twice, etc.
   * <p>Case 2: multi-term repeats<br>
   *
   * @return false if PPs are exhausted.
   */
  private boolean advanceRepeatGroups() throws IOException {
    for (PhrasePositions[] rg: rptGroups) {
      if (hasMultiTermRpts) {
        // more involved, some may not collide
        int incr;
        for (int i=0; i<rg.length; i+=incr) {
          incr = 1;
          PhrasePositions pp = rg[i];
          int k;
          while((k=collide(pp)) >= 0) {
            PhrasePositions pp2 = lesser(pp, rg[k]);
            if (!advancePP(pp2)) { // at initialization always advance pp with higher offset
              return false; // exhausted
            }
            if (pp2.rptInd < i) { // should not happen?
              incr = 0;
              break;
            }
          }
        }
      } else {
        // simpler, we know exactly how much to advance
        for (int j=1; j<rg.length; j++) {
          for (int k=0; k<j; k++) {
            if (!rg[j].nextPosition()) {
              return false; // PPs exhausted
            }
          }
        }
      }
    }
    return true; // PPs available
  }
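
In the branch above without multi-term repeats, the j-th member of each repetition group (sorted by query offset) is simply advanced j times, so the group members end up on distinct document positions. The following is a hypothetical walk-through of the class javadoc's "ho my my"~2 example, using the same position = tpPos - offset convention; it is an illustration only, not code from this patch.

// Hypothetical illustration: "ho my my"~2 run against the document "ho my my".
// Each pp starts on the first document position of its term; the second member of the
// repetition group for "my" is then advanced once, as the simple branch above does.
public class RepeatGroupInitSketch {
  public static void main(String[] args) {
    int[] queryOffsets = {0, 1, 2};        // ho, my, my
    int[][] termDocPositions = {           // doc positions of each pp's term
        {0},                               // "ho"
        {1, 2},                            // first "my"
        {1, 2}                             // second "my" (same repetition group)
    };
    // both "my" pps start on doc position 1 and collide; the 2nd group member is advanced once
    int[] posIndex = {0, 0, 1};
    for (int i = 0; i < 3; i++) {
      int tpPos = termDocPositions[i][posIndex[i]];
      int position = tpPos - queryOffsets[i];  // PhrasePositions convention
      System.out.println("pp" + i + ": tpPos=" + tpPos + " position=" + position);
    }
    // all three positions come out 0 -> an exact occurrence, matchLength 0 <= slop 2
  }
}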
  /** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
   * If there are repetitions, check if multi-term postings (MTP) are involved.<p>
   * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
   * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
   * For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point
   * to "A", P2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p>
   * The more complex initialization has two parts:<br>
   * (1) identification of repetition groups.<br>
   * (2) advancing repeat groups at the start of the doc.<br>
   * For (1), a possible solution is to just create a single repetition group,
   * made of all repeating pps. But this would slow down the check for collisions,
   * as all pps would need to be checked. Instead, we compute "connected regions"
   * on the bipartite graph of postings and terms.
   */
  private boolean initFirstTime() throws IOException {
    //System.err.println("initFirstTime: doc: "+min.doc);
    checkedRpts = true;
    placeFirstPositions();

    LinkedHashMap<Term,Integer> rptTerms = repeatingTerms();
    hasRpts = !rptTerms.isEmpty();

    if (hasRpts) {
      rptStack = new PhrasePositions[numPostings]; // needed with repetitions
      ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
      sortRptGroups(rgs);
      if (!advanceRepeatGroups()) {
        return false; // PPs exhausted
      }
    }

    fillQueue();
    return true; // PPs available
  }

  /** sort each repetition group by (query) offset.
   * Done only once (at first doc) and allows to initialize faster for each doc. */
  private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
    rptGroups = new PhrasePositions[rgs.size()][];
    Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
      public int compare(PhrasePositions pp1, PhrasePositions pp2) {
        return pp1.offset - pp2.offset;
      }
    };
    for (int i=0; i<rptGroups.length; i++) {
      PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
      Arrays.sort(rg, cmprtr);
      rptGroups[i] = rg;
      for (int j=0; j<rg.length; j++) {
        rg[j].rptInd = j; // we use this index for efficient re-queuing
      }
    }
  }

  /** Detect repetition groups. Done once - for first doc */
  private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
    PhrasePositions[] rpp = repeatingPPs(rptTerms);
    ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<ArrayList<PhrasePositions>>();
    if (!hasMultiTermRpts) {
      // simpler - no multi-terms - can base on positions in first doc
      for (int i=0; i<rpp.length; i++) {
        PhrasePositions pp = rpp[i];
        if (pp.rptGroup >=0) continue; // already marked as a repetition
        int tpPos = tpPos(pp);
        for (int j=i+1; j<rpp.length; j++) {
          PhrasePositions pp2 = rpp[j];
          if (
              pp2.rptGroup >=0 // already marked as a repetition
              || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query!
              || tpPos(pp2) != tpPos) { // not a repetition
            continue;
          }
          // a repetition
          int g = pp.rptGroup;
          if (g < 0) {
            g = res.size();
            pp.rptGroup = g;
            ArrayList<PhrasePositions> rl = new ArrayList<PhrasePositions>(2);
            rl.add(pp);
            res.add(rl);
          }
          pp2.rptGroup = g;
          res.get(g).add(pp2);
        }
      }
    } else {
      // more involved - has multi-terms
      ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<HashSet<PhrasePositions>>();
      ArrayList<OpenBitSet> bb = ppTermsBitSets(rpp, rptTerms);
      unionTermGroups(bb);
      HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
      HashSet<Integer> distinctGroupIDs = new HashSet<Integer>(tg.values());
      for (int i=0; i<distinctGroupIDs.size(); i++) {
        tmp.add(new HashSet<PhrasePositions>());
      }
      for (PhrasePositions pp : rpp) {
        for (Term t: pp.terms) {
          if (rptTerms.containsKey(t)) {
            int g = tg.get(t);
            tmp.get(g).add(pp);
            assert pp.rptGroup==-1 || pp.rptGroup==g;
            pp.rptGroup = g;
          }
        }
      }
      for (HashSet<PhrasePositions> hs : tmp) {
        res.add(new ArrayList<PhrasePositions>(hs));
      }
    }
    return res;
  }

  /** Actual position in doc of a PhrasePositions, relies on position = tpPos - offset */
  private final int tpPos(PhrasePositions pp) {
    return pp.position + pp.offset;
  }

  // private void printPositions(PrintStream ps, String title) {
  //   ps.println();
  //   ps.println("---- "+title);
  //   int k = 0;
  //   if (nrPps!=null) {
  //     for (PhrasePositions pp: nrPps) {
  //       ps.println(" " + k++ + " " + pp);
  //     }
  //   } else {
  //     for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
  //       ps.println(" " + k++ + " " + pp);
  //     }
  //   }
  // }

  /** find repeating terms and assign them ordinal values */
  private LinkedHashMap<Term,Integer> repeatingTerms() {
    LinkedHashMap<Term,Integer> tord = new LinkedHashMap<Term,Integer>();
    HashMap<Term,Integer> tcnt = new HashMap<Term,Integer>();
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      for (Term t : pp.terms) {
        Integer cnt0 = tcnt.get(t);
        Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
        tcnt.put(t, cnt);
        if (cnt==2) {
          tord.put(t,tord.size());
        }
      }
    }
    return tord;
  }

  /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */
  private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
    ArrayList<PhrasePositions> rp = new ArrayList<PhrasePositions>();
    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
      for (Term t : pp.terms) {
        if (rptTerms.containsKey(t)) {
          rp.add(pp);
          hasMultiTermRpts |= (pp.terms.length > 1);
          break;
        }
      }
    }
    return rp.toArray(new PhrasePositions[0]);
  }

  /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal value is set */
  private ArrayList<OpenBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
    ArrayList<OpenBitSet> bb = new ArrayList<OpenBitSet>(rpp.length);
    for (PhrasePositions pp : rpp) {
      OpenBitSet b = new OpenBitSet(tord.size());
      Integer ord;
      for (Term t: pp.terms) {
        if ((ord=tord.get(t))!=null) {
          b.set(ord);
        }
      }
      bb.add(b);
    }
    return bb;
  }

  /** union (term group) bit-sets until they are disjoint (O(n^2)), and each group has different terms */
  private void unionTermGroups(ArrayList<OpenBitSet> bb) {
    int incr;
    for (int i=0; i<bb.size()-1; i+=incr) {
      incr = 1;
      int j = i+1;
      while (j<bb.size()) {
        if (bb.get(i).intersects(bb.get(j))) {
          bb.get(i).union(bb.get(j));
          bb.remove(j);
          incr = 0;
        } else {
          ++j;
        }
      }
    }
  }

  /** map each term to the single group that contains it */
  private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<OpenBitSet> bb) throws IOException {
    HashMap<Term,Integer> tg = new HashMap<Term,Integer>();
    Term[] t = tord.keySet().toArray(new Term[0]);
    for (int i=0; i<bb.size(); i++) { // i is the group no.
      DocIdSetIterator bits = bb.get(i).iterator();
      int ord;
      while ((ord=bits.nextDoc())!=NO_MORE_DOCS) {
        tg.put(t[ord],i);
      }
    }
    return tg;
  }
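
In the multi-term branch above, ppTermsBitSets builds one bit set of repeating-term ordinals per repeating pp, unionTermGroups merges intersecting bit sets until they are disjoint, and termGroups then maps each term to its group; this is the "connected regions" computation mentioned in the initFirstTime javadoc. Below is a hypothetical sketch of that merge using java.util.BitSet and the javadoc's {A,B}/{B,C} example; it is illustrative only, not the OpenBitSet-based Lucene code.

// Hypothetical illustration: posting P1 matches terms {A,B}, posting P2 matches {B,C};
// because the two bit sets intersect on B they are unioned into one repetition group {A,B,C}.
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

public class TermGroupSketch {
  public static void main(String[] args) {
    // term ordinals: A=0, B=1, C=2 (as repeatingTerms() would assign them)
    List<BitSet> bb = new ArrayList<BitSet>();
    bb.add(bits(0, 1));   // P1: {A, B}
    bb.add(bits(1, 2));   // P2: {B, C}

    // union intersecting bit sets until all remaining sets are disjoint
    int incr;
    for (int i = 0; i < bb.size() - 1; i += incr) {
      incr = 1;
      int j = i + 1;
      while (j < bb.size()) {
        if (bb.get(i).intersects(bb.get(j))) {
          bb.get(i).or(bb.get(j)); // union
          bb.remove(j);
          incr = 0;
        } else {
          ++j;
        }
      }
    }
    System.out.println(bb); // [{0, 1, 2}] -- one connected group containing A, B and C
  }

  private static BitSet bits(int... ords) {
    BitSet b = new BitSet();
    for (int o : ords) b.set(o);
    return b;
  }
}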
  // private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
  //   //if (min.doc != ?) return;
  //   ps.println();
  //   ps.println("---- "+title);
  //   ps.println("EXT: "+ext);

@@ -266,7 +494,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
  //     ps.println(" " + 0 + " " + t[0]);
  //     for (int i=1; i<t.length; i++) {
  //       t[i] = pq.pop();
  //       assert t[i-1].position <= t[i].position : "PQ is out of order: "+(i-1)+"::"+t[i-1]+" "+i+"::"+t[i];
  //       assert t[i-1].position <= t[i].position;
  //       ps.println(" " + i + " " + t[i]);
  //     }
  //     // add them back

@@ -275,4 +503,5 @@ final class SloppyPhraseScorer extends PhraseScorer {
  //     }
  //   }
  // }

}
TestMultiPhraseQuery.java
@@ -38,6 +38,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;

/**
 * This class tests the MultiPhraseQuery class.

@@ -156,6 +157,43 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
    indexStore.close();
  }

  @Ignore //LUCENE-3821 fixes sloppy phrase scoring, except for this known problem
  public void testMultiSloppyWithRepeats() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
    add("a b c d e f g h i k", writer);
    IndexReader r = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(r);

    MultiPhraseQuery q = new MultiPhraseQuery();
    // this will fail, when the scorer would propagate [a] rather than [a,b],
    q.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
    q.add(new Term[] {new Term("body", "a")});
    q.setSlop(6);
    assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"

    r.close();
    indexStore.close();
  }

  public void testMultiExactWithRepeats() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
    add("a b c d e f g h i k", writer);
    IndexReader r = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(r);
    MultiPhraseQuery q = new MultiPhraseQuery();
    q.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
    q.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
    assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
    r.close();
    indexStore.close();
  }

  private void add(String s, RandomIndexWriter writer) throws IOException {
    Document doc = new Document();
    doc.add(newField("body", s, TextField.TYPE_STORED));
TestSloppyPhraseQuery2.java
@@ -21,12 +21,10 @@ import java.util.Random;

import org.apache.lucene.index.Term;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;

/**
 * random sloppy phrase query tests
 */
@Ignore("Put this back when we fix LUCENE-3821")
public class TestSloppyPhraseQuery2 extends SearchEquivalenceTestBase {
  /** "A B"~N ⊆ "A B"~N+1 */
  public void testIncreasingSloppiness() throws Exception {