mirror of https://github.com/apache/lucene.git

LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
query terms had same position in the query.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1124293 13f79535-47bb-0310-9956-ffa450edef68

commit af9930c6cb
parent 09ce1ac540
CHANGES.txt
@@ -451,6 +451,9 @@ Bug fixes
  indexes, causing existing deletions to be applied on the incoming indexes as
  well. (Shai Erera, Mike McCandless)

+* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
+  query terms had same position in the query. (Doron Cohen)
+
Test Cases

* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
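For illustration only, not part of this change: the entry above concerns queries in which two terms occupy the same query position, e.g. as produced by tokens with a position increment of 0. A minimal sketch using the pre-5.0 mutable PhraseQuery API; the class name, field name, and terms are made up and loosely mirror the canned test tokens added further below.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.PhraseQuery;

    class SamePositionPhraseQueryExample {     // hypothetical, for illustration only
      static PhraseQuery build() {
        PhraseQuery pq = new PhraseQuery();    // mutable PhraseQuery of this era
        pq.add(new Term("field", "a"), 0);
        pq.add(new Term("field", "1"), 0);     // second term at the same query position
        pq.add(new Term("field", "b"), 1);
        pq.add(new Term("field", "c"), 2);
        pq.setSlop(2);                         // slop tolerates gaps between query positions in the doc
        return pq;
      }
    }

Before this fix the sloppy scorer could miss documents that legitimately match such a query; the SloppyPhraseScorer and TestMultiPhraseQuery changes below address exactly that case.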
SloppyPhraseScorer.java
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
 */

import java.io.IOException;
-import java.util.HashMap;
+import java.util.HashSet;

final class SloppyPhraseScorer extends PhraseScorer {
  private int slop;
@@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends PhraseScorer {

  /**
   * Init PhrasePositions in place.
-  * There is a one time initialization for this scorer:
+  * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
   * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
+  *       This relies on that the position in PP is computed as (TP.position - offset) and
+  *       so by adding offset we actually compare positions and identify that the two are
+  *       the same term.
+  *       An exclusion to this is two distinct terms in the same offset in query and same
+  *       position in doc. This case is detected by comparing just the (query) offsets,
+  *       and two such PPs are not considered "repeating".
   * <br>- Also mark each such pp by pp.repeats = true.
   * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
   * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
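A small stand-alone sketch of the arithmetic described in the comment above; the helper name and values are invented for illustration. PhrasePositions stores the term's document position minus its query offset, so adding the offset back recovers the document position that gets compared.

    class RepeatDetectionSketch {   // hypothetical helper, not part of the patch
      // pos is stored as (document position - query offset), so the sum below
      // is the document position; equal sums mean the same spot in the document.
      static boolean samePositionInDoc(int pos1, int offset1, int pos2, int offset2) {
        return pos1 + offset1 == pos2 + offset2;
      }
    }

For example, in the test data added below, "b" and "1" both sit at document position 3 and both at query offset 1; their sums are equal, but because the query offsets are also equal they are deliberately not treated as repetitions, which is precisely the exclusion this commit adds.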
@@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends PhraseScorer {
    if (!checkedRepeats) {
      checkedRepeats = true;
      // check for repeats
-     HashMap<PhrasePositions, Object> m = null;
+     HashSet<PhrasePositions> m = null;
      for (PhrasePositions pp = first; pp != null; pp = pp.next) {
        int tpPos = pp.position + pp.offset;
        for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+         if (pp.offset == pp2.offset) {
+           continue; // not a repetition: the two PPs are originally in same offset in the query!
+         }
          int tpPos2 = pp2.position + pp2.offset;
          if (tpPos2 == tpPos) {
            if (m == null)
-             m = new HashMap<PhrasePositions, Object>();
+             m = new HashSet<PhrasePositions>();
            pp.repeats = true;
            pp2.repeats = true;
-           m.put(pp,null);
-           m.put(pp2,null);
+           m.add(pp);
+           m.add(pp2);
          }
        }
      }
      if (m!=null)
-       repeats = m.keySet().toArray(new PhrasePositions[0]);
+       repeats = m.toArray(new PhrasePositions[0]);
    }

    // with repeats must advance some repeating pp's so they all start with differing tp's
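An aside on the container change above, sketched with String elements instead of PhrasePositions (illustration only, not part of the patch): the old HashMap never used its values and was only read back through keySet(), i.e. it acted as a set, which the HashSet now states directly.

    import java.util.HashMap;
    import java.util.HashSet;

    class SetVsMapAsSet {   // illustration only
      static String[] viaMap(String a, String b) {
        HashMap<String, Object> m = new HashMap<String, Object>();
        m.put(a, null);     // value is never read
        m.put(b, null);
        return m.keySet().toArray(new String[0]);
      }

      static String[] viaSet(String a, String b) {
        HashSet<String> s = new HashSet<String>();
        s.add(a);           // same membership semantics, no dummy values
        s.add(b);
        return s.toArray(new String[0]);
      }
    }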
@@ -204,11 +213,16 @@ final class SloppyPhraseScorer extends PhraseScorer {
    int tpPos = pp.position + pp.offset;
    for (int i = 0; i < repeats.length; i++) {
      PhrasePositions pp2 = repeats[i];
-     if (pp2 == pp)
+     if (pp2 == pp) {
        continue;
+     }
+     if (pp.offset == pp2.offset) {
+       continue; // not a repetition: the two PPs are originally in same offset in the query!
+     }
      int tpPos2 = pp2.position + pp2.offset;
-     if (tpPos2 == tpPos)
+     if (tpPos2 == tpPos) {
        return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
+     }
    }
    return null;
  }
TestMultiPhraseQuery.java
@@ -17,11 +17,14 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -423,7 +426,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
    }
    TopDocs hits = s.search(mpq, 2);
-   assert hits.totalHits == 2;
+   assertEquals(2, hits.totalHits);
    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
    /*
    for(int hit=0;hit<hits.totalHits;hit++) {
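A side note on the assertion change above (illustration only; the class name and values are made up): a bare Java assert is skipped unless the JVM runs with -ea, while JUnit's assertEquals is always evaluated, so the converted check cannot be silently disabled.

    import org.junit.Assert;

    class AssertVsAssertEquals {           // hypothetical, for illustration only
      static void check(int totalHits) {
        assert totalHits == 2;             // no-op unless the JVM was started with -ea
        Assert.assertEquals(2, totalHits); // always runs; throws AssertionError on mismatch
      }
    }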
@@ -434,4 +437,156 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
    r.close();
    dir.close();
  }
+
+  private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
+    new TokenAndPos("x", 0),
+    new TokenAndPos("a", 1),
+    new TokenAndPos("1", 1),
+    new TokenAndPos("m", 2), // not existing, relying on slop=2
+    new TokenAndPos("b", 3),
+    new TokenAndPos("1", 3),
+    new TokenAndPos("n", 4), // not existing, relying on slop=2
+    new TokenAndPos("c", 5),
+    new TokenAndPos("y", 6)
+  };
+
+  private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
+    new TokenAndPos("a", 0),
+    new TokenAndPos("1", 0),
+    new TokenAndPos("b", 1),
+    new TokenAndPos("1", 1),
+    new TokenAndPos("c", 2)
+  };
+
+  private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
+    { new TokenAndPos("a", 0) },
+    { new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
+    { new TokenAndPos("b", 1) },
+    { new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
+    { new TokenAndPos("c", 2) }
+  };
+
+  private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
+    { new TokenAndPos("x", 0) },
+    { new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
+    { new TokenAndPos("x", 1) },
+    { new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
+    { new TokenAndPos("c", 2) }
+  };
+
+  /**
+   * using query parser, MPQ will be created, and will not be strict about having all query terms
+   * in each position - one of each position is sufficient (OR logic)
+   */
+  public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND));
+    final Query q = qp.parse("\"this text is acually ignored\"");
+    assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
+    doTestZeroPosIncrSloppy(q, 0);
+    ((MultiPhraseQuery) q).setSlop(1);
+    doTestZeroPosIncrSloppy(q, 0);
+    ((MultiPhraseQuery) q).setSlop(2);
+    doTestZeroPosIncrSloppy(q, 1);
+  }
+
+  private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
+    Directory dir = newDirectory(); // random dir
+    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
+    IndexWriter writer = new IndexWriter(dir, cfg);
+    Document doc = new Document();
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    IndexReader r = IndexReader.open(writer,false);
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+
+    if (VERBOSE) {
+      System.out.println("QUERY=" + q);
+    }
+
+    TopDocs hits = s.search(q, 1);
+    assertEquals("wrong number of results", nExpected, hits.totalHits);
+
+    if (VERBOSE) {
+      for(int hit=0;hit<hits.totalHits;hit++) {
+        ScoreDoc sd = hits.scoreDocs[hit];
+        System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
+      }
+    }
+
+    r.close();
+    dir.close();
+  }
+
+  /**
+   * PQ AND Mode - Manually creating a phrase query
+   */
+  public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
+    final PhraseQuery pq = new PhraseQuery();
+    for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+      pq.add(new Term("field",tap.token), tap.pos);
+    }
+    doTestZeroPosIncrSloppy(pq, 0);
+    pq.setSlop(1);
+    doTestZeroPosIncrSloppy(pq, 0);
+    pq.setSlop(2);
+    doTestZeroPosIncrSloppy(pq, 1);
+  }
+
+  /**
+   * MPQ AND Mode - Manually creating a multiple phrase query
+   */
+  public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+      mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(1);
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 1);
+  }
+
+  /**
+   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
+   */
+  public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
+      Term[] terms = tapTerms(tap);
+      final int pos = tap[0].pos;
+      mpq.add(terms, pos); //AND logic in pos, OR across lines
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(1);
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 1);
+  }
+
+  /**
+   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
+   */
+  public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
+      Term[] terms = tapTerms(tap);
+      final int pos = tap[0].pos;
+      mpq.add(terms, pos); //AND logic in pos, OR across lines
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 0);
+  }
+
+  private Term[] tapTerms(TokenAndPos[] tap) {
+    Term[] terms = new Term[tap.length];
+    for (int i=0; i<terms.length; i++) {
+      terms[i] = new Term("field",tap[i].token);
+    }
+    return terms;
+  }
+
}