From 187bfe2ebc4555e93b71d6fe463d14d1a3b45932 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Sat, 10 Mar 2012 00:06:13 +0000 Subject: [PATCH] LUCENE-3821: SloppyPhraseScorer sometimes misses documents that ExactPhraseScorer finds. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1299112 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 8 +- .../lucene/search/MultiPhraseQuery.java | 19 +- .../apache/lucene/search/PhrasePositions.java | 11 +- .../org/apache/lucene/search/PhraseQuery.java | 50 +- .../apache/lucene/search/PhraseScorer.java | 4 +- .../lucene/search/SloppyPhraseScorer.java | 531 +++++++++++++----- .../lucene/search/TestMultiPhraseQuery.java | 38 ++ .../lucene/search/TestSloppyPhraseQuery2.java | 2 - 8 files changed, 486 insertions(+), 177 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bc5e3f0d48b..7c98b27b02e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -919,7 +919,13 @@ Bug fixes from the delegate DocIdSet.iterator(), which is allowed to return null by DocIdSet specification when no documents match. (Shay Banon via Uwe Schindler) - + +* LUCENE-3821: SloppyPhraseScorer missed documents that ExactPhraseScorer finds + When phrase query had repeating terms (e.g. "yes ho yes") + sloppy query missed documents that exact query matched. + Fixed except for repeating multiterms (e.g. "yes ho yes|no"). 
+ (Robert Muir, Doron Cohen) + Optimizations * LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index d829bf32595..2dbc77e27cb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -22,7 +22,6 @@ import java.util.*; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; @@ -238,7 +237,7 @@ public class MultiPhraseQuery extends Query { docFreq = termsEnum.docFreq(); } - postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]); + postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms); } // sort by increasing docFreq order @@ -314,9 +313,21 @@ public class MultiPhraseQuery extends Query { } buffer.append("\""); + int k = 0; Iterator i = termArrays.iterator(); + int lastPos = -1; + boolean first = true; while (i.hasNext()) { Term[] terms = i.next(); + int position = positions.get(k); + if (first) { + first = false; + } else { + buffer.append(" "); + for (int j=1; j<(position-lastPos); j++) { + buffer.append("? 
"); + } + } if (terms.length > 1) { buffer.append("("); for (int j = 0; j < terms.length; j++) { @@ -328,8 +339,8 @@ public class MultiPhraseQuery extends Query { } else { buffer.append(terms[0].text()); } - if (i.hasNext()) - buffer.append(" "); + lastPos = position; + ++k; } buffer.append("\""); diff --git a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java index 004d857c0f6..b2d4afe3814 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java @@ -31,12 +31,15 @@ final class PhrasePositions { final int ord; // unique across all PhrasePositions instances final DocsAndPositionsEnum postings; // stream of docs & positions PhrasePositions next; // used to make lists - PhrasePositions nextRepeating; // link to next repeating pp: standing for same term in different query offsets + int rptGroup = -1; // >=0 indicates that this is a repeating PP + int rptInd; // index in the rptGroup + final Term[] terms; // for repetitions initialization - PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) { + PhrasePositions(DocsAndPositionsEnum postings, int o, int ord, Term[] terms) { this.postings = postings; offset = o; this.ord = ord; + this.terms = terms; } final boolean next() throws IOException { // increments to next doc @@ -78,8 +81,8 @@ final class PhrasePositions { @Override public String toString() { String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count; - if (nextRepeating!=null) { - s += " rpt[ "+nextRepeating+" ]"; + if (rptGroup >=0 ) { + s += " rpt:"+rptGroup+",i"+rptInd; } return s; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 30faaba84c3..2f2a45c9635 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Set; import org.apache.lucene.index.AtomicReaderContext; @@ -137,23 +138,46 @@ public class PhraseQuery extends Query { final DocsAndPositionsEnum postings; final int docFreq; final int position; - final Term term; + final Term[] terms; + final int nTerms; // for faster comparisons - public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) { + public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term... terms) { this.postings = postings; this.docFreq = docFreq; this.position = position; - this.term = term; + nTerms = terms==null ? 0 : terms.length; + if (nTerms>0) { + if (terms.length==1) { + this.terms = terms; + } else { + Term[] terms2 = new Term[terms.length]; + System.arraycopy(terms, 0, terms2, 0, terms.length); + Arrays.sort(terms2); + this.terms = terms2; + } + } else { + this.terms = null; + } } public int compareTo(PostingsAndFreq other) { - if (docFreq == other.docFreq) { - if (position == other.position) { - return term.compareTo(other.term); - } + if (docFreq != other.docFreq) { + return docFreq - other.docFreq; + } + if (position != other.position) { return position - other.position; } - return docFreq - other.docFreq; + if (nTerms != other.nTerms) { + return nTerms - other.nTerms; + } + if (nTerms == 0) { + return 0; + } + for (int i=0; i 0) { - min = new PhrasePositions(postings[0].postings, postings[0].position, 0); + min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms); max = min; max.doc = -1; for (int i = 1; i < postings.length; i++) { - PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i); + PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); 
max.next = pp; max = pp; max.doc = -1; diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index dbd5ca84d41..2f2f9ed59a5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -19,22 +19,38 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.OpenBitSet; final class SloppyPhraseScorer extends PhraseScorer { - private int slop; - private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats - private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc) - private PhraseQueue pq; // for advancing min position - private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset + + private final int slop; + private final int numPostings; + private final PhraseQueue pq; // for advancing min position + + private int end; // current largest phrase position + + private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc) + private boolean checkedRpts; // flag to only check for repetitions in first candidate doc + private boolean hasMultiTermRpts; // + private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. 
same term), sorted by (query) offset + private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, int slop, Similarity.SloppySimScorer docScorer) { super(weight, postings, docScorer); this.slop = slop; + this.numPostings = postings==null ? 0 : postings.length; + pq = new PhraseQueue(postings.length); } - + /** * Score a candidate doc for all slop-valid position-combinations (matches) * encountered while traversing/hopping the PhrasePositions. @@ -55,31 +71,27 @@ final class SloppyPhraseScorer extends PhraseScorer { */ @Override protected float phraseFreq() throws IOException { - int end = initPhrasePositions(); - //printPositions(System.err, "INIT DONE:"); - if (end==Integer.MIN_VALUE) { + if (!initPhrasePositions()) { return 0.0f; } - float freq = 0.0f; PhrasePositions pp = pq.pop(); int matchLength = end - pp.position; - int next = pq.size()>0 ? pq.top().position : pp.position; - //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength); - while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE) { - if (pp.position > next) { - //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength); + int next = pq.top().position; + while (advancePP(pp)) { + if (hasRpts && !advanceRpts(pp)) { + break; // pps exhausted + } + if (pp.position > next) { // done minimizing current match-length if (matchLength <= slop) { freq += docScorer.computeSlopFactor(matchLength); // score match } pq.add(pp); pp = pq.pop(); - next = pq.size()>0 ? 
pq.top().position : pp.position; + next = pq.top().position; matchLength = end - pp.position; - //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength); } else { int matchLength2 = end - pp.position; - //printQueue(System.err, pp, "C: mlen2 repeatsEnd) { - repeatsEnd = pp.position; + /** advance a PhrasePosition and update 'end', return false if exhausted */ + private boolean advancePP(PhrasePositions pp) throws IOException { + if (!pp.nextPosition()) { + return false; } - if (!hasRepeats) { - return repeatsEnd; + if (pp.position > end) { + end = pp.position; } + return true; + } + + /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser + * of the two colliding pps. Note that there can only be one collision, as by the initialization + * there were no collisions before pp was advanced. */ + private boolean advanceRpts(PhrasePositions pp) throws IOException { + if (pp.rptGroup < 0) { + return true; // not a repeater + } + PhrasePositions[] rg = rptGroups[pp.rptGroup]; + OpenBitSet bits = new OpenBitSet(rg.length); // for re-queuing after collisions are resolved + int k0 = pp.rptInd; + int k; + while((k=collide(pp)) >= 0) { + pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps + if (!advancePP(pp)) { + return false; // exhausted + } + if (k != k0) { // careful: mark only those currently in the queue + bits.set(k); // mark that pp2 need to be re-queued + } + } + // collisions resolved, now re-queue + // empty (partially) the queue until seeing all pps advanced for resolving collisions + int n = 0; + while (bits.cardinality() > 0) { + PhrasePositions pp2 = pq.pop(); + rptStack[n++] = pp2; + if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) { + bits.clear(pp2.rptInd); + } + } + // add back to queue + for (int i=n-1; i>=0; i--) { + pq.add(rptStack[i]); + } + return true; + } + + /** compare two pps, but only by position and offset */ + private PhrasePositions lesser(PhrasePositions 
pp, PhrasePositions pp2) { + if (pp.position < pp2.position || + (pp.position == pp2.position && pp.offset < pp2.offset)) { + return pp; + } + return pp2; + } + + /** index of a pp2 colliding with pp, or -1 if none */ + private int collide(PhrasePositions pp) { int tpPos = tpPos(pp); - for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) { - while (tpPos(pp2) <= tpPos) { - if (!pp2.nextPosition()) { - return Integer.MIN_VALUE; - } - } - tpPos = tpPos(pp2); - if (pp2.position > repeatsEnd) { - repeatsEnd = pp2.position; - } - // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position. - // so in order to have the right "start" in matchLength computation we fake pp.position. - // this relies on pp.nextPosition() not using pp.position. - if (pp2.position < pp.position) { - pp.position = pp2.position; + PhrasePositions[] rg = rptGroups[pp.rptGroup]; + for (int i=0; i - *
  • Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query. - *
  • For each such group: - *
      - *
    • form an inner linked list of the repeating ones. - *
    • propagate all group members but first so that they land on different tpPos(). - *
    - *
  • Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation. - *
  • Insert to pq only non repeating PPs, or PPs that are the first in a repeating group. + *
  • Check if there are repetitions + *
  • If there are, find groups of repetitions. * * Examples: *
      @@ -145,118 +186,305 @@ final class SloppyPhraseScorer extends PhraseScorer { *
    1. repetitions: "ho my my"~2 *
    2. repetitions: "my ho my"~2 *
    - * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done) + * @return false if PPs are exhausted (and so current doc will not be a match) */ - private int initPhrasePositions() throws IOException { - int end = Integer.MIN_VALUE; - - // no repeats at all (most common case is also the simplest one) - if (checkedRepeats && !hasRepeats) { - // build queue from list - pq.clear(); - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - pp.firstPosition(); - if (pp.position > end) { - end = pp.position; - } - pq.add(pp); // build pq from list - } - return end; + private boolean initPhrasePositions() throws IOException { + end = Integer.MIN_VALUE; + if (!checkedRpts) { + return initFirstTime(); } - - //printPositions(System.err, "Init: 1: Bef position"); - - // position the pp's - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - pp.firstPosition(); + if (!hasRpts) { + initSimple(); + return true; // PPs available } - - //printPositions(System.err, "Init: 2: Aft position"); - - // one time initialization for this scorer (done only for the first candidate doc) - if (!checkedRepeats) { - checkedRepeats = true; - ArrayList ppsA = new ArrayList(); - PhrasePositions dummyPP = new PhrasePositions(null, -1, -1); - // check for repeats - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - if (pp.nextRepeating != null) { - continue; // a repetition of an earlier pp - } - ppsA.add(pp); - int tpPos = tpPos(pp); - for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) { - if ( - pp2.nextRepeating != null // already detected as a repetition of an earlier pp - || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query! 
- || tpPos(pp2) != tpPos) { // not a repetition - continue; - } - // a repetition - hasRepeats = true; - prevB.nextRepeating = pp2; // add pp2 to the repeats linked list - pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list - prevB = pp2; - } - } - if (hasRepeats) { - // clean dummy markers - for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max - if (pp.nextRepeating == dummyPP) { - pp.nextRepeating = null; - } - } - } - nrPps = ppsA.toArray(new PhrasePositions[0]); - pq = new PhraseQueue(nrPps.length); - } - - //printPositions(System.err, "Init: 3: Aft check-repeats"); - - // with repeats must advance some repeating pp's so they all start with differing tp's - if (hasRepeats) { - for (PhrasePositions pp: nrPps) { - if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) { - return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc) - } - } - } - - //printPositions(System.err, "Init: 4: Aft advance-repeats"); - - // build queue from non repeating pps + return initComplex(); + } + + /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */ + private void initSimple() throws IOException { + //System.err.println("initSimple: doc: "+min.doc); pq.clear(); - for (PhrasePositions pp: nrPps) { + // position pps and build queue from list + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + pp.firstPosition(); if (pp.position > end) { end = pp.position; } pq.add(pp); } - - return end; } + /** with repeats: not so simple. 
*/ + private boolean initComplex() throws IOException { + //System.err.println("initComplex: doc: "+min.doc); + placeFirstPositions(); + if (!advanceRepeatGroups()) { + return false; // PPs exhausted + } + fillQueue(); + return true; // PPs available + } + + /** move all PPs to their first position */ + private void placeFirstPositions() throws IOException { + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + pp.firstPosition(); + } + } + + /** Fill the queue (all pps are already placed */ + private void fillQueue() { + pq.clear(); + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + if (pp.position > end) { + end = pp.position; + } + pq.add(pp); + } + } + + /** At initialization (each doc), each repetition group is sorted by (query) offset. + * This provides the start condition: no collisions. + *

    Case 1: no multi-term repeats
    + * It is sufficient to advance each pp in the group by one less than its group index. + * So lesser pp is not advanced, 2nd one advanced once, 3rd one advanced twice, etc.

    Case 2: multi-term repeats
    + * + * @return false if PPs are exhausted. + */ + private boolean advanceRepeatGroups() throws IOException { + for (PhrasePositions[] rg: rptGroups) { + if (hasMultiTermRpts) { + // more involved, some may not collide + int incr; + for (int i=0; i= 0) { + PhrasePositions pp2 = lesser(pp, rg[k]); + if (!advancePP(pp2)) { // at initialization always advance pp with higher offset + return false; // exhausted + } + if (pp2.rptInd < i) { // should not happen? + incr = 0; + break; + } + } + } + } else { + // simpler, we know exactly how much to advance + for (int j=1; j + * If there are repetitions, check if multi-term postings (MTP) are involved.

    + * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.
    + * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".
    + * For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point + * to "A", P2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.

    + * The more complex initialization has two parts:
    + * (1) identification of repetition groups.
    + * (2) advancing repeat groups at the start of the doc.
    + * For (1), a possible solution is to just create a single repetition group, + * made of all repeating pps. But this would slow down the check for collisions, + * as all pps would need to be checked. Instead, we compute "connected regions" + * on the bipartite graph of postings and terms. + */ + private boolean initFirstTime() throws IOException { + //System.err.println("initFirstTime: doc: "+min.doc); + checkedRpts = true; + placeFirstPositions(); + + LinkedHashMap rptTerms = repeatingTerms(); + hasRpts = !rptTerms.isEmpty(); + + if (hasRpts) { + rptStack = new PhrasePositions[numPostings]; // needed with repetitions + ArrayList> rgs = gatherRptGroups(rptTerms); + sortRptGroups(rgs); + if (!advanceRepeatGroups()) { + return false; // PPs exhausted + } + } + + fillQueue(); + return true; // PPs available + } + + /** sort each repetition group by (query) offset. + * Done only once (at first doc) and allows to initialize faster for each doc. */ + private void sortRptGroups(ArrayList> rgs) { + rptGroups = new PhrasePositions[rgs.size()][]; + Comparator cmprtr = new Comparator() { + public int compare(PhrasePositions pp1, PhrasePositions pp2) { + return pp1.offset - pp2.offset; + } + }; + for (int i=0; i> gatherRptGroups(LinkedHashMap rptTerms) throws IOException { + PhrasePositions[] rpp = repeatingPPs(rptTerms); + ArrayList> res = new ArrayList>(); + if (!hasMultiTermRpts) { + // simpler - no multi-terms - can base on positions in first doc + for (int i=0; i=0) continue; // already marked as a repetition + int tpPos = tpPos(pp); + for (int j=i+1; j=0 // already marked as a repetition + || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query! 
+ || tpPos(pp2) != tpPos) { // not a repetition + continue; + } + // a repetition + int g = pp.rptGroup; + if (g < 0) { + g = res.size(); + pp.rptGroup = g; + ArrayList rl = new ArrayList(2); + rl.add(pp); + res.add(rl); + } + pp2.rptGroup = g; + res.get(g).add(pp2); + } + } + } else { + // more involved - has multi-terms + ArrayList> tmp = new ArrayList>(); + ArrayList bb = ppTermsBitSets(rpp, rptTerms); + unionTermGroups(bb); + HashMap tg = termGroups(rptTerms, bb); + HashSet distinctGroupIDs = new HashSet(tg.values()); + for (int i=0; i()); + } + for (PhrasePositions pp : rpp) { + for (Term t: pp.terms) { + if (rptTerms.containsKey(t)) { + int g = tg.get(t); + tmp.get(g).add(pp); + assert pp.rptGroup==-1 || pp.rptGroup==g; + pp.rptGroup = g; + } + } + } + for (HashSet hs : tmp) { + res.add(new ArrayList(hs)); + } + } + return res; + } + /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */ private final int tpPos(PhrasePositions pp) { return pp.position + pp.offset; } - -// private void printPositions(PrintStream ps, String title) { -// ps.println(); -// ps.println("---- "+title); -// int k = 0; -// if (nrPps!=null) { -// for (PhrasePositions pp: nrPps) { -// ps.println(" " + k++ + " " + pp); -// } -// } else { -// for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) { -// ps.println(" " + k++ + " " + pp); -// } -// } -// } + /** find repeating terms and assign them ordinal values */ + private LinkedHashMap repeatingTerms() { + LinkedHashMap tord = new LinkedHashMap(); + HashMap tcnt = new HashMap(); + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + for (Term t : pp.terms) { + Integer cnt0 = tcnt.get(t); + Integer cnt = cnt0==null ? 
new Integer(1) : new Integer(1+cnt0.intValue()); + tcnt.put(t, cnt); + if (cnt==2) { + tord.put(t,tord.size()); + } + } + } + return tord; + } + + /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */ + private PhrasePositions[] repeatingPPs(HashMap rptTerms) { + ArrayList rp = new ArrayList(); + for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max + for (Term t : pp.terms) { + if (rptTerms.containsKey(t)) { + rp.add(pp); + hasMultiTermRpts |= (pp.terms.length > 1); + break; + } + } + } + return rp.toArray(new PhrasePositions[0]); + } + + /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal values is set */ + private ArrayList ppTermsBitSets(PhrasePositions[] rpp, HashMap tord) { + ArrayList bb = new ArrayList(rpp.length); + for (PhrasePositions pp : rpp) { + OpenBitSet b = new OpenBitSet(tord.size()); + Integer ord; + for (Term t: pp.terms) { + if ((ord=tord.get(t))!=null) { + b.set(ord); + } + } + bb.add(b); + } + return bb; + } + + /** union (term group) bit-sets until they are disjoint (O(n^^2)), and each group have different terms */ + private void unionTermGroups(ArrayList bb) { + int incr; + for (int i=0; i termGroups(LinkedHashMap tord, ArrayList bb) throws IOException { + HashMap tg = new HashMap(); + Term[] t = tord.keySet().toArray(new Term[0]); + for (int i=0; i