mirror of https://github.com/apache/lucene.git
UCENE-3068: sloppy phrase query failed to match valid documents when multiple
query terms had same position in the query. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1124293 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
09ce1ac540
commit
af9930c6cb
|
@ -451,6 +451,9 @@ Bug fixes
|
||||||
indexes, causing existing deletions to be applied on the incoming indexes as
|
indexes, causing existing deletions to be applied on the incoming indexes as
|
||||||
well. (Shai Erera, Mike McCandless)
|
well. (Shai Erera, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
|
||||||
|
query terms had same position in the query. (Doron Cohen)
|
||||||
|
|
||||||
Test Cases
|
Test Cases
|
||||||
|
|
||||||
* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
|
* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
|
||||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashSet;
|
||||||
|
|
||||||
final class SloppyPhraseScorer extends PhraseScorer {
|
final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
private int slop;
|
private int slop;
|
||||||
|
@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Init PhrasePositions in place.
|
* Init PhrasePositions in place.
|
||||||
* There is a one time initialization for this scorer:
|
* There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
|
||||||
* <br>- Put in repeats[] each pp that has another pp with same position in the doc.
|
* <br>- Put in repeats[] each pp that has another pp with same position in the doc.
|
||||||
|
* This relies on that the position in PP is computed as (TP.position - offset) and
|
||||||
|
* so by adding offset we actually compare positions and identify that the two are
|
||||||
|
* the same term.
|
||||||
|
* An exclusion to this is two distinct terms in the same offset in query and same
|
||||||
|
* position in doc. This case is detected by comparing just the (query) offsets,
|
||||||
|
* and two such PPs are not considered "repeating".
|
||||||
* <br>- Also mark each such pp by pp.repeats = true.
|
* <br>- Also mark each such pp by pp.repeats = true.
|
||||||
* <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
|
* <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
|
||||||
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
|
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
|
||||||
|
@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
if (!checkedRepeats) {
|
if (!checkedRepeats) {
|
||||||
checkedRepeats = true;
|
checkedRepeats = true;
|
||||||
// check for repeats
|
// check for repeats
|
||||||
HashMap<PhrasePositions, Object> m = null;
|
HashSet<PhrasePositions> m = null;
|
||||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||||
int tpPos = pp.position + pp.offset;
|
int tpPos = pp.position + pp.offset;
|
||||||
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
|
||||||
|
if (pp.offset == pp2.offset) {
|
||||||
|
continue; // not a repetition: the two PPs are originally in same offset in the query!
|
||||||
|
}
|
||||||
int tpPos2 = pp2.position + pp2.offset;
|
int tpPos2 = pp2.position + pp2.offset;
|
||||||
if (tpPos2 == tpPos) {
|
if (tpPos2 == tpPos) {
|
||||||
if (m == null)
|
if (m == null)
|
||||||
m = new HashMap<PhrasePositions, Object>();
|
m = new HashSet<PhrasePositions>();
|
||||||
pp.repeats = true;
|
pp.repeats = true;
|
||||||
pp2.repeats = true;
|
pp2.repeats = true;
|
||||||
m.put(pp,null);
|
m.add(pp);
|
||||||
m.put(pp2,null);
|
m.add(pp2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (m!=null)
|
if (m!=null)
|
||||||
repeats = m.keySet().toArray(new PhrasePositions[0]);
|
repeats = m.toArray(new PhrasePositions[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// with repeats must advance some repeating pp's so they all start with differing tp's
|
// with repeats must advance some repeating pp's so they all start with differing tp's
|
||||||
|
@ -204,12 +213,17 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
int tpPos = pp.position + pp.offset;
|
int tpPos = pp.position + pp.offset;
|
||||||
for (int i = 0; i < repeats.length; i++) {
|
for (int i = 0; i < repeats.length; i++) {
|
||||||
PhrasePositions pp2 = repeats[i];
|
PhrasePositions pp2 = repeats[i];
|
||||||
if (pp2 == pp)
|
if (pp2 == pp) {
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
if (pp.offset == pp2.offset) {
|
||||||
|
continue; // not a repetition: the two PPs are originally in same offset in the query!
|
||||||
|
}
|
||||||
int tpPos2 = pp2.position + pp2.offset;
|
int tpPos2 = pp2.position + pp2.offset;
|
||||||
if (tpPos2 == tpPos)
|
if (tpPos2 == tpPos) {
|
||||||
return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
|
return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,11 +17,14 @@ package org.apache.lucene.search;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.MultiFields;
|
import org.apache.lucene.index.MultiFields;
|
||||||
|
import org.apache.lucene.queryParser.ParseException;
|
||||||
|
import org.apache.lucene.queryParser.QueryParser;
|
||||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -423,7 +426,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
|
||||||
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
|
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
|
||||||
}
|
}
|
||||||
TopDocs hits = s.search(mpq, 2);
|
TopDocs hits = s.search(mpq, 2);
|
||||||
assert hits.totalHits == 2;
|
assertEquals(2, hits.totalHits);
|
||||||
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
|
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
|
||||||
/*
|
/*
|
||||||
for(int hit=0;hit<hits.totalHits;hit++) {
|
for(int hit=0;hit<hits.totalHits;hit++) {
|
||||||
|
@ -434,4 +437,156 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
|
||||||
|
new TokenAndPos("x", 0),
|
||||||
|
new TokenAndPos("a", 1),
|
||||||
|
new TokenAndPos("1", 1),
|
||||||
|
new TokenAndPos("m", 2), // not existing, relying on slop=2
|
||||||
|
new TokenAndPos("b", 3),
|
||||||
|
new TokenAndPos("1", 3),
|
||||||
|
new TokenAndPos("n", 4), // not existing, relying on slop=2
|
||||||
|
new TokenAndPos("c", 5),
|
||||||
|
new TokenAndPos("y", 6)
|
||||||
|
};
|
||||||
|
|
||||||
|
private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
|
||||||
|
new TokenAndPos("a", 0),
|
||||||
|
new TokenAndPos("1", 0),
|
||||||
|
new TokenAndPos("b", 1),
|
||||||
|
new TokenAndPos("1", 1),
|
||||||
|
new TokenAndPos("c", 2)
|
||||||
|
};
|
||||||
|
|
||||||
|
private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
|
||||||
|
{ new TokenAndPos("a", 0) },
|
||||||
|
{ new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
|
||||||
|
{ new TokenAndPos("b", 1) },
|
||||||
|
{ new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
|
||||||
|
{ new TokenAndPos("c", 2) }
|
||||||
|
};
|
||||||
|
|
||||||
|
private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
|
||||||
|
{ new TokenAndPos("x", 0) },
|
||||||
|
{ new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
|
||||||
|
{ new TokenAndPos("x", 1) },
|
||||||
|
{ new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
|
||||||
|
{ new TokenAndPos("c", 2) }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* using query parser, MPQ will be created, and will not be strict about having all query terms
|
||||||
|
* in each position - one of each position is sufficient (OR logic)
|
||||||
|
*/
|
||||||
|
public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
|
||||||
|
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND));
|
||||||
|
final Query q = qp.parse("\"this text is acually ignored\"");
|
||||||
|
assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
|
||||||
|
doTestZeroPosIncrSloppy(q, 0);
|
||||||
|
((MultiPhraseQuery) q).setSlop(1);
|
||||||
|
doTestZeroPosIncrSloppy(q, 0);
|
||||||
|
((MultiPhraseQuery) q).setSlop(2);
|
||||||
|
doTestZeroPosIncrSloppy(q, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
|
||||||
|
Directory dir = newDirectory(); // random dir
|
||||||
|
IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
|
||||||
|
IndexWriter writer = new IndexWriter(dir, cfg);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader r = IndexReader.open(writer,false);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher s = new IndexSearcher(r);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("QUERY=" + q);
|
||||||
|
}
|
||||||
|
|
||||||
|
TopDocs hits = s.search(q, 1);
|
||||||
|
assertEquals("wrong number of results", nExpected, hits.totalHits);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
for(int hit=0;hit<hits.totalHits;hit++) {
|
||||||
|
ScoreDoc sd = hits.scoreDocs[hit];
|
||||||
|
System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PQ AND Mode - Manually creating a phrase query
|
||||||
|
*/
|
||||||
|
public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
|
||||||
|
final PhraseQuery pq = new PhraseQuery();
|
||||||
|
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
|
||||||
|
pq.add(new Term("field",tap.token), tap.pos);
|
||||||
|
}
|
||||||
|
doTestZeroPosIncrSloppy(pq, 0);
|
||||||
|
pq.setSlop(1);
|
||||||
|
doTestZeroPosIncrSloppy(pq, 0);
|
||||||
|
pq.setSlop(2);
|
||||||
|
doTestZeroPosIncrSloppy(pq, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MPQ AND Mode - Manually creating a multiple phrase query
|
||||||
|
*/
|
||||||
|
public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
|
||||||
|
final MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||||
|
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
|
||||||
|
mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
|
||||||
|
}
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
mpq.setSlop(1);
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
mpq.setSlop(2);
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MPQ Combined AND OR Mode - Manually creating a multiple phrase query
|
||||||
|
*/
|
||||||
|
public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
|
||||||
|
final MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||||
|
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
|
||||||
|
Term[] terms = tapTerms(tap);
|
||||||
|
final int pos = tap[0].pos;
|
||||||
|
mpq.add(terms, pos); //AND logic in pos, OR across lines
|
||||||
|
}
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
mpq.setSlop(1);
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
mpq.setSlop(2);
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
|
||||||
|
*/
|
||||||
|
public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
|
||||||
|
final MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||||
|
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
|
||||||
|
Term[] terms = tapTerms(tap);
|
||||||
|
final int pos = tap[0].pos;
|
||||||
|
mpq.add(terms, pos); //AND logic in pos, OR across lines
|
||||||
|
}
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
mpq.setSlop(2);
|
||||||
|
doTestZeroPosIncrSloppy(mpq, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Term[] tapTerms(TokenAndPos[] tap) {
|
||||||
|
Term[] terms = new Term[tap.length];
|
||||||
|
for (int i=0; i<terms.length; i++) {
|
||||||
|
terms[i] = new Term("field",tap[i].token);
|
||||||
|
}
|
||||||
|
return terms;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue