From 0eb1be2b481745a2902da01d2b411edf3bfa08f5 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Sun, 3 Aug 2008 15:40:14 +0000 Subject: [PATCH] LUCENE-1310: Phrase query with term repeated 3 times needed more slop than expected. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@682186 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../lucene/search/SloppyPhraseScorer.java | 69 +++++--- .../lucene/search/TestSloppyPhraseQuery.java | 155 ++++++++++++++++++ 3 files changed, 204 insertions(+), 23 deletions(-) create mode 100755 src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index 4e9c7ee35f2..4c08a29c83a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -136,6 +136,9 @@ Bug fixes depending only upon the non-payload score part, regardless of the effect of the payload on the score. Prior to this, score of a query containing a BTQ differed from its explanation. (Doron Cohen) + +14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more + than twice in the query. (Doron Cohen) New features diff --git a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index c039c61fb33..2778ae8a55a 100644 --- a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -20,13 +20,12 @@ package org.apache.lucene.search; import org.apache.lucene.index.TermPositions; import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; import java.util.HashMap; final class SloppyPhraseScorer extends PhraseScorer { private int slop; private PhrasePositions repeats[]; + private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, @@ -66,12 +65,16 @@ final class SloppyPhraseScorer extends PhraseScorer { boolean tpsDiffer = true; for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) { if (pos<=next && tpsDiffer) - start = pos; // advance pp to min window + start = pos; // advance pp to min window if (!pp.nextPosition()) { done = true; // ran out of a term -- done break; } - tpsDiffer = !pp.repeats || termPositionsDiffer(pp); + PhrasePositions pp2 = null; + tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null; + if (pp2!=null && pp2!=pp) { + pp = flip(pp,pp2); // flip pp to pp2 + } } int matchLength = end - start; @@ -80,20 +83,38 @@ final class SloppyPhraseScorer extends PhraseScorer { if (pp.position > end) end = pp.position; - pq.put(pp); // restore pq + pq.put(pp); // restore pq } return freq; } - + // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back. + // assumes: pp!=pp2, pp2 in pq, pp not in pq. + // called only when there are repeating pps. + private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) { + int n=0; + PhrasePositions pp3; + //pop until finding pp2 + while ((pp3=(PhrasePositions)pq.pop()) != pp2) { + tmpPos[n++] = pp3; + } + //insert back all but pp2 + for (n--; n>=0; n--) { + pq.insert(tmpPos[n]); + } + //insert pp back + pq.put(pp); + return pp2; + } + /** * Init PhrasePositions in place. - * There is a one time initializatin for this scorer: + * There is a one time initialization for this scorer: *
- Put in repeats[] each pp that has another pp with same position in the doc. *
- Also mark each such pp by pp.repeats = true. *
Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient. - * In particular, this allows to score queries with no repetiotions with no overhead due to this computation. + * In particular, this allows to score queries with no repetitions with no overhead due to this computation. *
- Example 1 - query with no repetitions: "ho my"~2 *
- Example 2 - query with repetitions: "ho my my"~2 *
- Example 3 - query with repetitions: "my ho my"~2 @@ -146,17 +167,12 @@ final class SloppyPhraseScorer extends PhraseScorer { // with repeats must advance some repeating pp's so they all start with differing tp's if (repeats!=null) { - // must propagate higher offsets first (otherwise might miss matches). - Arrays.sort(repeats, new Comparator() { - public int compare(Object x, Object y) { - return ((PhrasePositions) y).offset - ((PhrasePositions) x).offset; - }}); - // now advance them for (int i = 0; i < repeats.length; i++) { PhrasePositions pp = repeats[i]; - while (!termPositionsDiffer(pp)) { - if (!pp.nextPosition()) - return -1; // ran out of a term -- done + PhrasePositions pp2; + while ((pp2 = termPositionsDiffer(pp)) != null) { + if (!pp2.nextPosition()) // out of pps that do not differ, advance the pp with higher offset + return -1; // ran out of a term -- done } } } @@ -169,13 +185,20 @@ final class SloppyPhraseScorer extends PhraseScorer { pq.put(pp); // build pq from list } + if (repeats!=null) { + tmpPos = new PhrasePositions[pq.size()]; + } return end; } - // disalow two pp's to have the same tp position, so that same word twice - // in query would go elswhere in the matched doc - private boolean termPositionsDiffer(PhrasePositions pp) { - // efficiency note: a more efficient implemention could keep a map between repeating + /** + * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences + * in the query of the same word would go elsewhere in the matched doc. + * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions + * out of the first two PPs found to not differ. + */ + private PhrasePositions termPositionsDiffer(PhrasePositions pp) { + // efficiency note: a more efficient implementation could keep a map between repeating // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. // However this would complicate code, for a rather rare case, so choice is to compromise here. @@ -186,8 +209,8 @@ final class SloppyPhraseScorer extends PhraseScorer { continue; int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) - return false; + return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset. } - return true; + return null; } } diff --git a/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java new file mode 100755 index 00000000000..7ef087fbfa1 --- /dev/null +++ b/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -0,0 +1,155 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.store.RAMDirectory; + +public class TestSloppyPhraseQuery extends TestCase { + + private static final String S_1 = "A A A"; + private static final String S_2 = "A 1 2 3 A 4 5 6 A"; + + private static final Document DOC_1 = makeDocument("X " + S_1 + " Y"); + private static final Document DOC_2 = makeDocument("X " + S_2 + " Y"); + private static final Document DOC_3 = makeDocument("X " + S_1 + " A Y"); + private static final Document DOC_1_B = makeDocument("X " + S_1 + " Y N N N N " + S_1 + " Z"); + private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z"); + private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y"); + private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A"); + + private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 ); + private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 ); + private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A"); + + + /** + * Test DOC_4 and QUERY_4. + * QUERY_4 has a fuzzy (len=1) match to DOC_4, so all slop values > 0 should succeed. + * But only the 3rd sequence of A's in DOC_4 will do. + */ + public void testDoc4_Query4_All_Slops_Should_match() throws Exception { + for (int slop=0; slop<30; slop++) { + int numResultsExpected = slop<1 ? 0 : 1; + checkPhraseQuery(DOC_4, QUERY_4, slop, numResultsExpected); + } + } + + /** + * Test DOC_1 and QUERY_1. + * QUERY_1 has an exact match to DOC_1, so all slop values should succeed. + * Before LUCENE-1310, a slop value of 1 did not succeed. + */ + public void testDoc1_Query1_All_Slops_Should_match() throws Exception { + for (int slop=0; slop<30; slop++) { + float score1 = checkPhraseQuery(DOC_1, QUERY_1, slop, 1); + float score2 = checkPhraseQuery(DOC_1_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + } + } + + /** + * Test DOC_2 and QUERY_1. + * 6 should be the minimum slop to make QUERY_1 match DOC_2. + * Before LUCENE-1310, 7 was the minimum. + */ + public void testDoc2_Query1_Slop_6_or_more_Should_match() throws Exception { + for (int slop=0; slop<30; slop++) { + int numResultsExpected = slop<6 ? 0 : 1; + float score1 = checkPhraseQuery(DOC_2, QUERY_1, slop, numResultsExpected); + if (numResultsExpected>0) { + float score2 = checkPhraseQuery(DOC_2_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + } + } + } + + /** + * Test DOC_2 and QUERY_2. + * QUERY_2 has an exact match to DOC_2, so all slop values should succeed. + * Before LUCENE-1310, 0 succeeds, 1 through 7 fail, and 8 or greater succeeds. + */ + public void testDoc2_Query2_All_Slops_Should_match() throws Exception { + for (int slop=0; slop<30; slop++) { + float score1 = checkPhraseQuery(DOC_2, QUERY_2, slop, 1); + float score2 = checkPhraseQuery(DOC_2_B, QUERY_2, slop, 1); + assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + } + } + + /** + * Test DOC_3 and QUERY_1. + * QUERY_1 has an exact match to DOC_3, so all slop values should succeed. + */ + public void testDoc3_Query1_All_Slops_Should_match() throws Exception { + for (int slop=0; slop<30; slop++) { + float score1 = checkPhraseQuery(DOC_3, QUERY_1, slop, 1); + float score2 = checkPhraseQuery(DOC_3_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + } + } + + private float checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception { + query.setSlop(slop); + + RAMDirectory ramDir = new RAMDirectory(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(); + IndexWriter writer = new IndexWriter(ramDir, analyzer, MaxFieldLength.UNLIMITED); + writer.addDocument(doc); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(ramDir); + TopDocs td = searcher.search(query,null,10); + //System.out.println("slop: "+slop+" query: "+query+" doc: "+doc+" Expecting number of hits: "+expectedNumResults+" maxScore="+td.getMaxScore()); + assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, td.totalHits); + + //QueryUtils.check(query,searcher); + + searcher.close(); + ramDir.close(); + + return td.getMaxScore(); + } + + private static Document makeDocument(String docText) { + Document doc = new Document(); + Field f = new Field("f", docText, Field.Store.NO, Field.Index.TOKENIZED); + f.setOmitNorms(true); + doc.add(f); + return doc; + } + + private static PhraseQuery makePhraseQuery(String terms) { + PhraseQuery query = new PhraseQuery(); + String[] t = terms.split(" +"); + for (int i=0; i