LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
query terms had same position in the query.
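For context, a minimal sketch of the failing scenario, mirroring the INCR_0_QUERY_TOKENS_AND
fixture added in the test below (the "field" name and token values come from that test; this
uses the standard PhraseQuery/Term API and is illustrative, not part of the commit):

    // Terms "a" and "1" share query position 0, "b" and "1" share position 1.
    // With slop 2 this should match the canned document "x (a 1) _ (b 1) _ c y",
    // but the sloppy scorer could miss it before this fix.
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("field", "a"), 0);
    pq.add(new Term("field", "1"), 0);
    pq.add(new Term("field", "b"), 1);
    pq.add(new Term("field", "1"), 1);
    pq.add(new Term("field", "c"), 2);
    pq.setSlop(2); // expected: 1 hit once LUCENE-3068 is fixed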


git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1124293 13f79535-47bb-0310-9956-ffa450edef68
Doron Cohen 2011-05-18 14:59:38 +00:00
parent 09ce1ac540
commit af9930c6cb
3 changed files with 182 additions and 10 deletions

View File: CHANGES.txt

@@ -451,6 +451,9 @@ Bug fixes
   indexes, causing existing deletions to be applied on the incoming indexes as
   well.  (Shai Erera, Mike McCandless)
+* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
+  query terms had same position in the query. (Doron Cohen)
 Test Cases
 * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to

View File: SloppyPhraseScorer.java

@@ -18,7 +18,7 @@ package org.apache.lucene.search;
  */
 import java.io.IOException;
-import java.util.HashMap;
+import java.util.HashSet;
 final class SloppyPhraseScorer extends PhraseScorer {
   private int slop;
@@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends PhraseScorer {
   /**
    * Init PhrasePositions in place.
-   * There is a one time initialization for this scorer:
+   * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
    * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
+   *       This relies on that the position in PP is computed as (TP.position - offset) and
+   *       so by adding offset we actually compare positions and identify that the two are
+   *       the same term.
+   *       An exclusion to this is two distinct terms in the same offset in query and same
+   *       position in doc. This case is detected by comparing just the (query) offsets,
+   *       and two such PPs are not considered "repeating".
    * <br>- Also mark each such pp by pp.repeats = true.
    * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
    * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
@@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends PhraseScorer {
     if (!checkedRepeats) {
       checkedRepeats = true;
       // check for repeats
-      HashMap<PhrasePositions, Object> m = null;
+      HashSet<PhrasePositions> m = null;
       for (PhrasePositions pp = first; pp != null; pp = pp.next) {
         int tpPos = pp.position + pp.offset;
         for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+          if (pp.offset == pp2.offset) {
+            continue; // not a repetition: the two PPs are originally in same offset in the query!
+          }
           int tpPos2 = pp2.position + pp2.offset;
           if (tpPos2 == tpPos) {
             if (m == null)
-              m = new HashMap<PhrasePositions, Object>();
+              m = new HashSet<PhrasePositions>();
             pp.repeats = true;
             pp2.repeats = true;
-            m.put(pp,null);
-            m.put(pp2,null);
+            m.add(pp);
+            m.add(pp2);
           }
         }
       }
       if (m!=null)
-        repeats = m.keySet().toArray(new PhrasePositions[0]);
+        repeats = m.toArray(new PhrasePositions[0]);
     }
     // with repeats must advance some repeating pp's so they all start with differing tp's
@@ -204,12 +213,17 @@ final class SloppyPhraseScorer extends PhraseScorer {
       int tpPos = pp.position + pp.offset;
       for (int i = 0; i < repeats.length; i++) {
         PhrasePositions pp2 = repeats[i];
-        if (pp2 == pp)
+        if (pp2 == pp) {
           continue;
+        }
+        if (pp.offset == pp2.offset) {
+          continue; // not a repetition: the two PPs are originally in same offset in the query!
+        }
         int tpPos2 = pp2.position + pp2.offset;
-        if (tpPos2 == tpPos)
+        if (tpPos2 == tpPos) {
           return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
+        }
       }
       return null;
   }
 }
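In short, the scorer now treats two PhrasePositions as a repetition only if they resolve to the
same document position and originate from different query offsets. A self-contained sketch of
that check follows (the PP class here is a stand-in with just the two fields the diff uses; it is
not the scorer's actual package-private PhrasePositions class):

    // Illustrative only: mimics the repeat test added in the hunks above.
    final class RepeatCheckSketch {
      static final class PP { int position, offset; } // stand-in for PhrasePositions

      static boolean repeats(PP pp, PP pp2) {
        if (pp.offset == pp2.offset) {
          return false; // distinct terms at the same query offset are NOT repeats (the fix)
        }
        // pp.position is stored as (doc position - query offset), so adding the offset
        // back recovers the absolute document position being compared.
        return pp.position + pp.offset == pp2.position + pp2.offset;
      }
    }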

View File: TestMultiPhraseQuery.java

@@ -17,11 +17,14 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
@@ -423,7 +426,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
       mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
     }
     TopDocs hits = s.search(mpq, 2);
-    assert hits.totalHits == 2;
+    assertEquals(2, hits.totalHits);
     assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
     /*
     for(int hit=0;hit<hits.totalHits;hit++) {
@@ -434,4 +437,156 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
     r.close();
     dir.close();
   }
+
+  private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
+      new TokenAndPos("x", 0),
+      new TokenAndPos("a", 1),
+      new TokenAndPos("1", 1),
+      new TokenAndPos("m", 2), // not existing, relying on slop=2
+      new TokenAndPos("b", 3),
+      new TokenAndPos("1", 3),
+      new TokenAndPos("n", 4), // not existing, relying on slop=2
+      new TokenAndPos("c", 5),
+      new TokenAndPos("y", 6)
+  };
+
+  private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
+      new TokenAndPos("a", 0),
+      new TokenAndPos("1", 0),
+      new TokenAndPos("b", 1),
+      new TokenAndPos("1", 1),
+      new TokenAndPos("c", 2)
+  };
+
+  private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
+      { new TokenAndPos("a", 0) },
+      { new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
+      { new TokenAndPos("b", 1) },
+      { new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
+      { new TokenAndPos("c", 2) }
+  };
+
+  private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
+      { new TokenAndPos("x", 0) },
+      { new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
+      { new TokenAndPos("x", 1) },
+      { new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
+      { new TokenAndPos("c", 2) }
+  };
+
+  /**
+   * using query parser, MPQ will be created, and will not be strict about having all query terms
+   * in each position - one of each position is sufficient (OR logic)
+   */
+  public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND));
+    final Query q = qp.parse("\"this text is acually ignored\"");
+    assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
+    doTestZeroPosIncrSloppy(q, 0);
+    ((MultiPhraseQuery) q).setSlop(1);
+    doTestZeroPosIncrSloppy(q, 0);
+    ((MultiPhraseQuery) q).setSlop(2);
+    doTestZeroPosIncrSloppy(q, 1);
+  }
+
+  private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
+    Directory dir = newDirectory(); // random dir
+    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
+    IndexWriter writer = new IndexWriter(dir, cfg);
+    Document doc = new Document();
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    IndexReader r = IndexReader.open(writer,false);
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+    if (VERBOSE) {
+      System.out.println("QUERY=" + q);
+    }
+    TopDocs hits = s.search(q, 1);
+    assertEquals("wrong number of results", nExpected, hits.totalHits);
+    if (VERBOSE) {
+      for(int hit=0;hit<hits.totalHits;hit++) {
+        ScoreDoc sd = hits.scoreDocs[hit];
+        System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+      }
+    }
+    r.close();
+    dir.close();
+  }
+
+  /**
+   * PQ AND Mode - Manually creating a phrase query
+   */
+  public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
+    final PhraseQuery pq = new PhraseQuery();
+    for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+      pq.add(new Term("field",tap.token), tap.pos);
+    }
+    doTestZeroPosIncrSloppy(pq, 0);
+    pq.setSlop(1);
+    doTestZeroPosIncrSloppy(pq, 0);
+    pq.setSlop(2);
+    doTestZeroPosIncrSloppy(pq, 1);
+  }
+
+  /**
+   * MPQ AND Mode - Manually creating a multiple phrase query
+   */
+  public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+      mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(1);
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 1);
+  }
+
+  /**
+   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
+   */
+  public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
+      Term[] terms = tapTerms(tap);
+      final int pos = tap[0].pos;
+      mpq.add(terms, pos); //AND logic in pos, OR across lines
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(1);
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 1);
+  }
+
+  /**
+   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
+   */
+  public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
+    final MultiPhraseQuery mpq = new MultiPhraseQuery();
+    for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
+      Term[] terms = tapTerms(tap);
+      final int pos = tap[0].pos;
+      mpq.add(terms, pos); //AND logic in pos, OR across lines
+    }
+    doTestZeroPosIncrSloppy(mpq, 0);
+    mpq.setSlop(2);
+    doTestZeroPosIncrSloppy(mpq, 0);
+  }
+
+  private Term[] tapTerms(TokenAndPos[] tap) {
+    Term[] terms = new Term[tap.length];
+    for (int i=0; i<terms.length; i++) {
+      terms[i] = new Term("field",tap[i].token);
+    }
+    return terms;
+  }
 }
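As a usage note, the combined AND/OR case exercised by testZeroPosIncrSloppyMpqAndOrMatch boils
down to the following query shape: each add() call contributes one position, terms within one
Term[] are alternatives at that position, and all positions must be matched within the slop. A
condensed sketch (the "field" name and token values follow the test fixture above; searcher is
an assumed IndexSearcher over the canned document):

    MultiPhraseQuery mpq = new MultiPhraseQuery();
    mpq.add(new Term[] { new Term("field", "a") }, 0);                          // single term at position 0
    mpq.add(new Term[] { new Term("field", "x"), new Term("field", "1") }, 0);  // OR group, also at position 0
    mpq.add(new Term[] { new Term("field", "b") }, 1);
    mpq.add(new Term[] { new Term("field", "x"), new Term("field", "1") }, 1);
    mpq.add(new Term[] { new Term("field", "c") }, 2);
    mpq.setSlop(2);
    TopDocs hits = searcher.search(mpq, 1); // expected after LUCENE-3068: 1 hit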