LUCENE-3821: SloppyPhraseScorer sometimes misses documents that ExactPhraseScorer finds.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1299112 13f79535-47bb-0310-9956-ffa450edef68
Doron Cohen 2012-03-10 00:06:13 +00:00
parent bf2be9b35a
commit 187bfe2ebc
8 changed files with 486 additions and 177 deletions


@@ -919,7 +919,13 @@ Bug fixes
from the delegate DocIdSet.iterator(), which is allowed to return
null by DocIdSet specification when no documents match.
(Shay Banon via Uwe Schindler)
* LUCENE-3821: SloppyPhraseScorer missed documents that ExactPhraseScorer finds.
When a phrase query had repeating terms (e.g. "yes ho yes"), the sloppy
query missed documents that the exact query matched. Fixed, except for
repeating multi-terms (e.g. "yes ho yes|no").
(Robert Muir, Doron Cohen)
Optimizations
* LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by
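
To make the LUCENE-3821 entry above concrete, the following is a hedged sketch of the reported scenario, written in the style of the tests added later in this patch. The class name is invented, and it relies on the same LuceneTestCase helpers (newDirectory, random, newField, newSearcher) that those tests use; it is an illustration, not part of the commit.

public class TestSloppyRepeatsSketch extends LuceneTestCase {
  public void testSloppySeesWhatExactSees() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, dir);
    Document doc = new Document();
    doc.add(newField("body", "yes ho yes", TextField.TYPE_STORED));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);

    // exact phrase "yes ho yes" (repeating term "yes")
    PhraseQuery exact = new PhraseQuery();
    exact.add(new Term("body", "yes"));
    exact.add(new Term("body", "ho"));
    exact.add(new Term("body", "yes"));

    // the same phrase with slop: before this fix, sloppy scoring with
    // repeating terms could miss documents that the exact scorer finds
    PhraseQuery sloppy = new PhraseQuery();
    sloppy.add(new Term("body", "yes"));
    sloppy.add(new Term("body", "ho"));
    sloppy.add(new Term("body", "yes"));
    sloppy.setSlop(2);

    // after the fix, "A B C"~N is a superset of the exact phrase "A B C"
    assertEquals(1, searcher.search(exact, 1).totalHits);
    assertEquals(1, searcher.search(sloppy, 1).totalHits);

    reader.close();
    dir.close();
  }
}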


@@ -22,7 +22,6 @@ import java.util.*;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
@@ -238,7 +237,7 @@ public class MultiPhraseQuery extends Query {
docFreq = termsEnum.docFreq();
}
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]);
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
}
// sort by increasing docFreq order
@@ -314,9 +313,21 @@ public class MultiPhraseQuery extends Query {
}
buffer.append("\"");
int k = 0;
Iterator<Term[]> i = termArrays.iterator();
int lastPos = -1;
boolean first = true;
while (i.hasNext()) {
Term[] terms = i.next();
int position = positions.get(k);
if (first) {
first = false;
} else {
buffer.append(" ");
for (int j=1; j<(position-lastPos); j++) {
buffer.append("? ");
}
}
if (terms.length > 1) {
buffer.append("(");
for (int j = 0; j < terms.length; j++) {
@@ -328,8 +339,8 @@ public class MultiPhraseQuery extends Query {
} else {
buffer.append(terms[0].text());
}
if (i.hasNext())
buffer.append(" ");
lastPos = position;
++k;
}
buffer.append("\"");
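
For illustration, the rewritten toString() loop above prints a "? " placeholder for every query position that carries no terms. A hedged snippet (terms and field name are made up; the usual org.apache.lucene.index.Term and MultiPhraseQuery imports are assumed):

MultiPhraseQuery q = new MultiPhraseQuery();
q.add(new Term[] {new Term("body", "quick")}, 0);
q.add(new Term[] {new Term("body", "fox"), new Term("body", "dog")}, 3);
// positions 1 and 2 carry no terms, so two "? " fillers are emitted between
// the entries; the output is roughly: "quick ? ? (fox dog)"
System.out.println(q.toString("body"));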


@@ -31,12 +31,15 @@ final class PhrasePositions {
final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists
PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets
int rptGroup = -1; // >=0 indicates that this is a repeating PP
int rptInd; // index in the rptGroup
final Term[] terms; // for repetitions initialization
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord, Term[] terms) {
this.postings = postings;
offset = o;
this.ord = ord;
this.terms = terms;
}
final boolean next() throws IOException { // increments to next doc
@@ -78,8 +81,8 @@ final class PhrasePositions {
@Override
public String toString() {
String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
if (nextRepeating!=null) {
s += " rpt[ "+nextRepeating+" ]";
if (rptGroup >=0 ) {
s += " rpt:"+rptGroup+",i"+rptInd;
}
return s;
}


@@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.index.AtomicReaderContext;
@@ -137,23 +138,46 @@ public class PhraseQuery extends Query {
final DocsAndPositionsEnum postings;
final int docFreq;
final int position;
final Term term;
final Term[] terms;
final int nTerms; // for faster comparisons
public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) {
public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term... terms) {
this.postings = postings;
this.docFreq = docFreq;
this.position = position;
this.term = term;
nTerms = terms==null ? 0 : terms.length;
if (nTerms>0) {
if (terms.length==1) {
this.terms = terms;
} else {
Term[] terms2 = new Term[terms.length];
System.arraycopy(terms, 0, terms2, 0, terms.length);
Arrays.sort(terms2);
this.terms = terms2;
}
} else {
this.terms = null;
}
}
public int compareTo(PostingsAndFreq other) {
if (docFreq == other.docFreq) {
if (position == other.position) {
return term.compareTo(other.term);
}
if (docFreq != other.docFreq) {
return docFreq - other.docFreq;
}
if (position != other.position) {
return position - other.position;
}
return docFreq - other.docFreq;
if (nTerms != other.nTerms) {
return nTerms - other.nTerms;
}
if (nTerms == 0) {
return 0;
}
for (int i=0; i<terms.length; i++) {
int res = terms[i].compareTo(other.terms[i]);
if (res!=0) return res;
}
return 0;
}
@Override
@@ -162,7 +186,9 @@ public class PhraseQuery extends Query {
int result = 1;
result = prime * result + docFreq;
result = prime * result + position;
result = prime * result + ((term == null) ? 0 : term.hashCode());
for (int i=0; i<nTerms; i++) {
result = prime * result + terms[i].hashCode();
}
return result;
}
@@ -174,10 +200,8 @@ public class PhraseQuery extends Query {
PostingsAndFreq other = (PostingsAndFreq) obj;
if (docFreq != other.docFreq) return false;
if (position != other.position) return false;
if (term == null) {
if (other.term != null) return false;
} else if (!term.equals(other.term)) return false;
return true;
if (terms == null) return other.terms == null;
return Arrays.equals(terms, other.terms);
}
}
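
The reworked compareTo() above now breaks ties on the sorted term arrays instead of a single term: docFreq first, then position, then array length, then element-wise comparison. A hedged standalone rendering of that ordering, with plain Strings standing in for Term and an invented class name:

public class PostingsOrderSketch {
  // same comparison chain as PostingsAndFreq.compareTo(), on plain keys
  static int compareKeys(int docFreq1, int pos1, String[] terms1,
                         int docFreq2, int pos2, String[] terms2) {
    if (docFreq1 != docFreq2) {
      return docFreq1 - docFreq2;
    }
    if (pos1 != pos2) {
      return pos1 - pos2;
    }
    if (terms1.length != terms2.length) {
      return terms1.length - terms2.length;
    }
    for (int i = 0; i < terms1.length; i++) {
      int res = terms1[i].compareTo(terms2[i]);
      if (res != 0) {
        return res;
      }
    }
    return 0;
  }

  public static void main(String[] args) {
    String[] a = {"ho", "yes"}; // already sorted, as the constructor sorts multi-terms
    String[] b = {"no", "yes"};
    // equal docFreq and position: the tie is broken by the first differing term
    System.out.println(compareKeys(5, 0, a, 5, 0, b) < 0); // true, since "ho" < "no"
  }
}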


@@ -49,11 +49,11 @@ abstract class PhraseScorer extends Scorer {
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
if (postings.length > 0) {
min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
max = min;
max.doc = -1;
for (int i = 1; i < postings.length; i++) {
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
max.next = pp;
max = pp;
max.doc = -1;


@@ -19,22 +19,38 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.OpenBitSet;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
private PhraseQueue pq; // for advancing min position
private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset
private final int slop;
private final int numPostings;
private final PhraseQueue pq; // for advancing min position
private int end; // current largest phrase position
private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
private boolean hasMultiTermRpts; // flag indicating that repetitions involve multi-term postings
private PhrasePositions[][] rptGroups; // in each group are PPs that repeat each other (i.e. same term), sorted by (query) offset
private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
int slop, Similarity.SloppySimScorer docScorer) {
super(weight, postings, docScorer);
this.slop = slop;
this.numPostings = postings==null ? 0 : postings.length;
pq = new PhraseQueue(postings.length);
}
/**
* Score a candidate doc for all slop-valid position-combinations (matches)
* encountered while traversing/hopping the PhrasePositions.
@@ -55,31 +71,27 @@ final class SloppyPhraseScorer extends PhraseScorer {
*/
@Override
protected float phraseFreq() throws IOException {
int end = initPhrasePositions();
//printPositions(System.err, "INIT DONE:");
if (end==Integer.MIN_VALUE) {
if (!initPhrasePositions()) {
return 0.0f;
}
float freq = 0.0f;
PhrasePositions pp = pq.pop();
int matchLength = end - pp.position;
int next = pq.size()>0 ? pq.top().position : pp.position;
//printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) {
if (pp.position > next) {
//printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
int next = pq.top().position;
while (advancePP(pp)) {
if (hasRpts && !advanceRpts(pp)) {
break; // pps exhausted
}
if (pp.position > next) { // done minimizing current match-length
if (matchLength <= slop) {
freq += docScorer.computeSlopFactor(matchLength); // score match
}
pq.add(pp);
pp = pq.pop();
next = pq.size()>0 ? pq.top().position : pp.position;
next = pq.top().position;
matchLength = end - pp.position;
//printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
} else {
int matchLength2 = end - pp.position;
//printQueue(System.err, pp, "C: mlen2<mlen: next="+next+" matchLength="+matchLength+" matchLength2="+matchLength2);
if (matchLength2 < matchLength) {
matchLength = matchLength2;
}
@@ -91,53 +103,82 @@ final class SloppyPhraseScorer extends PhraseScorer {
return freq;
}
/**
* Advance repeating pps of an input (non-repeating) pp.
* Return a modified 'end' in case pp or its repeats exceeds original 'end'.
* "Dirty" trick: when there are repeats, modifies pp's position to that of
* least repeater of pp (needed when due to holes repeaters' positions are "back").
*/
private int advanceRepeats(PhrasePositions pp, int end) throws IOException {
int repeatsEnd = end;
if (pp.position > repeatsEnd) {
repeatsEnd = pp.position;
/** advance a PhrasePosition and update 'end', return false if exhausted */
private boolean advancePP(PhrasePositions pp) throws IOException {
if (!pp.nextPosition()) {
return false;
}
if (!hasRepeats) {
return repeatsEnd;
if (pp.position > end) {
end = pp.position;
}
return true;
}
/** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
* of the two colliding pps. Note that there can only be one collision, as by the initialization
* there were no collisions before pp was advanced. */
private boolean advanceRpts(PhrasePositions pp) throws IOException {
if (pp.rptGroup < 0) {
return true; // not a repeater
}
PhrasePositions[] rg = rptGroups[pp.rptGroup];
OpenBitSet bits = new OpenBitSet(rg.length); // for re-queuing after collisions are resolved
int k0 = pp.rptInd;
int k;
while((k=collide(pp)) >= 0) {
pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
if (!advancePP(pp)) {
return false; // exhausted
}
if (k != k0) { // careful: mark only those currently in the queue
bits.set(k); // mark that pp2 needs to be re-queued
}
}
// collisions resolved, now re-queue
// empty (partially) the queue until seeing all pps advanced for resolving collisions
int n = 0;
while (bits.cardinality() > 0) {
PhrasePositions pp2 = pq.pop();
rptStack[n++] = pp2;
if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) {
bits.clear(pp2.rptInd);
}
}
// add back to queue
for (int i=n-1; i>=0; i--) {
pq.add(rptStack[i]);
}
return true;
}
/** return the lesser of two pps, comparing only by position and then offset */
private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
if (pp.position < pp2.position ||
(pp.position == pp2.position && pp.offset < pp2.offset)) {
return pp;
}
return pp2;
}
/** index of a pp2 colliding with pp, or -1 if none */
private int collide(PhrasePositions pp) {
int tpPos = tpPos(pp);
for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
while (tpPos(pp2) <= tpPos) {
if (!pp2.nextPosition()) {
return Integer.MIN_VALUE;
}
}
tpPos = tpPos(pp2);
if (pp2.position > repeatsEnd) {
repeatsEnd = pp2.position;
}
// "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position.
// so in order to have the right "start" in matchLength computation we fake pp.position.
// this relies on pp.nextPosition() not using pp.position.
if (pp2.position < pp.position) {
pp.position = pp2.position;
PhrasePositions[] rg = rptGroups[pp.rptGroup];
for (int i=0; i<rg.length; i++) {
PhrasePositions pp2 = rg[i];
if (pp2 != pp && tpPos(pp2) == tpPos) {
return pp2.rptInd;
}
}
return repeatsEnd;
return -1;
}
/**
* Initialize PhrasePositions in place.
* There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
* A one time initialization for this scorer (on first doc matching all terms):
* <ul>
* <li>Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
* <li>For each such group:
* <ul>
* <li>form an inner linked list of the repeating ones.
* <li>propagate all group members but first so that they land on different tpPos().
* </ul>
* <li>Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
* <li>Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
* <li>Check if there are repetitions
* <li>If there are, find groups of repetitions.
* </ul>
* Examples:
* <ol>
@@ -145,118 +186,305 @@ final class SloppyPhraseScorer extends PhraseScorer {
* <li>repetitions: <b>"ho my my"~2</b>
* <li>repetitions: <b>"my ho my"~2</b>
* </ol>
* @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done)
* @return false if PPs are exhausted (and so current doc will not be a match)
*/
private int initPhrasePositions() throws IOException {
int end = Integer.MIN_VALUE;
// no repeats at all (most common case is also the simplest one)
if (checkedRepeats && !hasRepeats) {
// build queue from list
pq.clear();
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
pp.firstPosition();
if (pp.position > end) {
end = pp.position;
}
pq.add(pp); // build pq from list
}
return end;
private boolean initPhrasePositions() throws IOException {
end = Integer.MIN_VALUE;
if (!checkedRpts) {
return initFirstTime();
}
//printPositions(System.err, "Init: 1: Bef position");
// position the pp's
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
pp.firstPosition();
if (!hasRpts) {
initSimple();
return true; // PPs available
}
//printPositions(System.err, "Init: 2: Aft position");
// one time initialization for this scorer (done only for the first candidate doc)
if (!checkedRepeats) {
checkedRepeats = true;
ArrayList<PhrasePositions> ppsA = new ArrayList<PhrasePositions>();
PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
// check for repeats
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
if (pp.nextRepeating != null) {
continue; // a repetition of an earlier pp
}
ppsA.add(pp);
int tpPos = tpPos(pp);
for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
if (
pp2.nextRepeating != null // already detected as a repetition of an earlier pp
|| pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query!
|| tpPos(pp2) != tpPos) { // not a repetition
continue;
}
// a repetition
hasRepeats = true;
prevB.nextRepeating = pp2; // add pp2 to the repeats linked list
pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
prevB = pp2;
}
}
if (hasRepeats) {
// clean dummy markers
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
if (pp.nextRepeating == dummyPP) {
pp.nextRepeating = null;
}
}
}
nrPps = ppsA.toArray(new PhrasePositions[0]);
pq = new PhraseQueue(nrPps.length);
}
//printPositions(System.err, "Init: 3: Aft check-repeats");
// with repeats must advance some repeating pp's so they all start with differing tp's
if (hasRepeats) {
for (PhrasePositions pp: nrPps) {
if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
}
}
}
//printPositions(System.err, "Init: 4: Aft advance-repeats");
// build queue from non repeating pps
return initComplex();
}
/** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
private void initSimple() throws IOException {
//System.err.println("initSimple: doc: "+min.doc);
pq.clear();
for (PhrasePositions pp: nrPps) {
// position pps and build queue from list
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
pp.firstPosition();
if (pp.position > end) {
end = pp.position;
}
pq.add(pp);
}
return end;
}
/** with repeats: not so simple. */
private boolean initComplex() throws IOException {
//System.err.println("initComplex: doc: "+min.doc);
placeFirstPositions();
if (!advanceRepeatGroups()) {
return false; // PPs exhausted
}
fillQueue();
return true; // PPs available
}
/** move all PPs to their first position */
private void placeFirstPositions() throws IOException {
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
pp.firstPosition();
}
}
/** Fill the queue (all pps are already placed) */
private void fillQueue() {
pq.clear();
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
if (pp.position > end) {
end = pp.position;
}
pq.add(pp);
}
}
/** At initialization (each doc), each repetition group is sorted by (query) offset.
* This provides the start condition: no collisions.
* <p>Case 1: no multi-term repeats<br>
* It is sufficient to advance each pp in the group by one less than its group index.
* So the lesser pp is not advanced, the 2nd one advances once, the 3rd one twice, etc.
* <p>Case 2: multi-term repeats<br>
* Collisions are detected and resolved one at a time by advancing the lesser of each colliding pair; not all group members necessarily collide.
* @return false if PPs are exhausted.
*/
private boolean advanceRepeatGroups() throws IOException {
for (PhrasePositions[] rg: rptGroups) {
if (hasMultiTermRpts) {
// more involved, some may not collide
int incr;
for (int i=0; i<rg.length; i+=incr) {
incr = 1;
PhrasePositions pp = rg[i];
int k;
while((k=collide(pp)) >= 0) {
PhrasePositions pp2 = lesser(pp, rg[k]);
if (!advancePP(pp2)) { // at initialization always advance pp with higher offset
return false; // exhausted
}
if (pp2.rptInd < i) { // should not happen?
incr = 0;
break;
}
}
}
} else {
// simpler, we know exactly how much to advance
for (int j=1; j<rg.length; j++) {
for (int k=0; k<j; k++) {
if (!rg[j].nextPosition()) {
return false; // PPs exhausted
}
}
}
}
}
return true; // PPs available
}
/** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
* If there are repetitions, check if multi-term postings (MTP) are involved.<p>
* Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
* With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
* For example P1 has {A,B}, P2 has {B,C}, and the first doc is "A C B". At start, P1 would point
* to "A", P2 to "C", and it would not be identified that P1 and P2 are repetitions of each other.<p>
* The more complex initialization has two parts:<br>
* (1) identification of repetition groups.<br>
* (2) advancing repeat groups at the start of the doc.<br>
* For (1), a possible solution is to just create a single repetition group,
* made of all repeating pps. But this would slow down the check for collisions,
* as all pps would need to be checked. Instead, we compute "connected regions"
* on the bipartite graph of postings and terms.
*/
private boolean initFirstTime() throws IOException {
//System.err.println("initFirstTime: doc: "+min.doc);
checkedRpts = true;
placeFirstPositions();
LinkedHashMap<Term,Integer> rptTerms = repeatingTerms();
hasRpts = !rptTerms.isEmpty();
if (hasRpts) {
rptStack = new PhrasePositions[numPostings]; // needed with repetitions
ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
sortRptGroups(rgs);
if (!advanceRepeatGroups()) {
return false; // PPs exhausted
}
}
fillQueue();
return true; // PPs available
}
/** sort each repetition group by (query) offset.
* Done only once (at first doc) and allows faster initialization for each subsequent doc. */
private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
rptGroups = new PhrasePositions[rgs.size()][];
Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
public int compare(PhrasePositions pp1, PhrasePositions pp2) {
return pp1.offset - pp2.offset;
}
};
for (int i=0; i<rptGroups.length; i++) {
PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
Arrays.sort(rg, cmprtr);
rptGroups[i] = rg;
for (int j=0; j<rg.length; j++) {
rg[j].rptInd = j; // we use this index for efficient re-queuing
}
}
}
/** Detect repetition groups. Done once - for first doc */
private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
PhrasePositions[] rpp = repeatingPPs(rptTerms);
ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<ArrayList<PhrasePositions>>();
if (!hasMultiTermRpts) {
// simpler - no multi-terms - can base on positions in first doc
for (int i=0; i<rpp.length; i++) {
PhrasePositions pp = rpp[i];
if (pp.rptGroup >=0) continue; // already marked as a repetition
int tpPos = tpPos(pp);
for (int j=i+1; j<rpp.length; j++) {
PhrasePositions pp2 = rpp[j];
if (
pp2.rptGroup >=0 // already marked as a repetition
|| pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query!
|| tpPos(pp2) != tpPos) { // not a repetition
continue;
}
// a repetition
int g = pp.rptGroup;
if (g < 0) {
g = res.size();
pp.rptGroup = g;
ArrayList<PhrasePositions> rl = new ArrayList<PhrasePositions>(2);
rl.add(pp);
res.add(rl);
}
pp2.rptGroup = g;
res.get(g).add(pp2);
}
}
} else {
// more involved - has multi-terms
ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<HashSet<PhrasePositions>>();
ArrayList<OpenBitSet> bb = ppTermsBitSets(rpp, rptTerms);
unionTermGroups(bb);
HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
HashSet<Integer> distinctGroupIDs = new HashSet<Integer>(tg.values());
for (int i=0; i<distinctGroupIDs.size(); i++) {
tmp.add(new HashSet<PhrasePositions>());
}
for (PhrasePositions pp : rpp) {
for (Term t: pp.terms) {
if (rptTerms.containsKey(t)) {
int g = tg.get(t);
tmp.get(g).add(pp);
assert pp.rptGroup==-1 || pp.rptGroup==g;
pp.rptGroup = g;
}
}
}
for (HashSet<PhrasePositions> hs : tmp) {
res.add(new ArrayList<PhrasePositions>(hs));
}
}
return res;
}
/** Actual position in doc of a PhrasePosition; relies on the invariant position = tpPos - offset. */
private final int tpPos(PhrasePositions pp) {
return pp.position + pp.offset;
}
// private void printPositions(PrintStream ps, String title) {
// ps.println();
// ps.println("---- "+title);
// int k = 0;
// if (nrPps!=null) {
// for (PhrasePositions pp: nrPps) {
// ps.println(" " + k++ + " " + pp);
// }
// } else {
// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {
// ps.println(" " + k++ + " " + pp);
// }
// }
// }
/** find repeating terms and assign them ordinal values */
private LinkedHashMap<Term,Integer> repeatingTerms() {
LinkedHashMap<Term,Integer> tord = new LinkedHashMap<Term,Integer>();
HashMap<Term,Integer> tcnt = new HashMap<Term,Integer>();
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
for (Term t : pp.terms) {
Integer cnt0 = tcnt.get(t);
Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
tcnt.put(t, cnt);
if (cnt==2) {
tord.put(t,tord.size());
}
}
}
return tord;
}
/** find repeating pps, and for each one that has multi-terms, update this.hasMultiTermRpts */
private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
ArrayList<PhrasePositions> rp = new ArrayList<PhrasePositions>();
for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
for (Term t : pp.terms) {
if (rptTerms.containsKey(t)) {
rp.add(pp);
hasMultiTermRpts |= (pp.terms.length > 1);
break;
}
}
}
return rp.toArray(new PhrasePositions[0]);
}
/** bit-sets - for each repeating pp, for each of its repeating terms, the term's ordinal value is set */
private ArrayList<OpenBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
ArrayList<OpenBitSet> bb = new ArrayList<OpenBitSet>(rpp.length);
for (PhrasePositions pp : rpp) {
OpenBitSet b = new OpenBitSet(tord.size());
Integer ord;
for (Term t: pp.terms) {
if ((ord=tord.get(t))!=null) {
b.set(ord);
}
}
bb.add(b);
}
return bb;
}
/** union (term group) bit-sets until they are disjoint (O(n^2)); afterwards no two groups share a term */
private void unionTermGroups(ArrayList<OpenBitSet> bb) {
int incr;
for (int i=0; i<bb.size()-1; i+=incr) {
incr = 1;
int j = i+1;
while (j<bb.size()) {
if (bb.get(i).intersects(bb.get(j))) {
bb.get(i).union(bb.get(j));
bb.remove(j);
incr = 0;
} else {
++j;
}
}
}
}
/** map each term to the single group that contains it */
private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<OpenBitSet> bb) throws IOException {
HashMap<Term,Integer> tg = new HashMap<Term,Integer>();
Term[] t = tord.keySet().toArray(new Term[0]);
for (int i=0; i<bb.size(); i++) { // i is the group no.
DocIdSetIterator bits = bb.get(i).iterator();
int ord;
while ((ord=bits.nextDoc())!=NO_MORE_DOCS) {
tg.put(t[ord],i);
}
}
return tg;
}
// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
// //if (min.doc != ?) return;
// ps.println();
// ps.println("---- "+title);
// ps.println("EXT: "+ext);
@@ -266,7 +494,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
// ps.println(" " + 0 + " " + t[0]);
// for (int i=1; i<t.length; i++) {
// t[i] = pq.pop();
// assert t[i-1].position <= t[i].position : "PQ is out of order: "+(i-1)+"::"+t[i-1]+" "+i+"::"+t[i];
// assert t[i-1].position <= t[i].position;
// ps.println(" " + i + " " + t[i]);
// }
// // add them back
@@ -275,4 +503,5 @@ final class SloppyPhraseScorer extends PhraseScorer {
// }
// }
// }
}
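
The javadoc of initFirstTime() above mentions computing "connected regions" on the bipartite graph of postings and terms. The following is a hedged toy model of that union step (a standalone class; java.util.BitSet and invented term ordinals stand in for OpenBitSet and the real repeating-term table), showing how two multi-term postings that share only a single term still end up in the same repetition group:

import java.util.ArrayList;
import java.util.BitSet;

public class RptGroupToy {
  public static void main(String[] args) {
    // P1={A,B} -> {0,1}, P2={B,C} -> {1,2}, P3={D,E} -> {3,4}
    ArrayList<BitSet> bb = new ArrayList<BitSet>();
    bb.add(bits(0, 1));
    bb.add(bits(1, 2));
    bb.add(bits(3, 4));
    // union intersecting bit-sets until all remaining sets are disjoint
    for (int i = 0; i < bb.size() - 1; ) {
      boolean merged = false;
      for (int j = i + 1; j < bb.size(); ) {
        if (bb.get(i).intersects(bb.get(j))) {
          bb.get(i).or(bb.get(j)); // union the two groups
          bb.remove(j);
          merged = true;
        } else {
          j++;
        }
      }
      if (!merged) {
        i++; // advance only once row i has stopped absorbing other rows
      }
    }
    // prints [{0, 1, 2}, {3, 4}]: P1 and P2 land in the same repetition group
    // even though their term arrays never point at the same doc position
    System.out.println(bb);
  }

  private static BitSet bits(int... ords) {
    BitSet b = new BitSet();
    for (int o : ords) {
      b.set(o);
    }
    return b;
  }
}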


@@ -38,6 +38,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
/**
* This class tests the MultiPhraseQuery class.
@@ -156,6 +157,43 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
indexStore.close();
}
@Ignore //LUCENE-3821 fixes sloppy phrase scoring, except for this known problem
public void testMultiSloppyWithRepeats() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
add("a b c d e f g h i k", writer);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
MultiPhraseQuery q = new MultiPhraseQuery();
// this fails when the scorer propagates [a] rather than [a,b]
q.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
q.add(new Term[] {new Term("body", "a")});
q.setSlop(6);
assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
r.close();
indexStore.close();
}
public void testMultiExactWithRepeats() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
add("a b c d e f g h i k", writer);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
MultiPhraseQuery q = new MultiPhraseQuery();
q.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
q.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
assertEquals(1, searcher.search(q, 1).totalHits); // should match on "d e f": d at relative position 0, f at relative position 2
r.close();
indexStore.close();
}
private void add(String s, RandomIndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(newField("body", s, TextField.TYPE_STORED));


@@ -21,12 +21,10 @@ import java.util.Random;
import org.apache.lucene.index.Term;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
/**
* random sloppy phrase query tests
*/
@Ignore("Put this back when we fix LUCENE-3821")
public class TestSloppyPhraseQuery2 extends SearchEquivalenceTestBase {
/** "A B"~N ⊆ "A B"~N+1 */
public void testIncreasingSloppiness() throws Exception {