SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1406437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-11-07 03:25:20 +00:00
parent debe67b60b
commit d93f599e32
3 changed files with 174 additions and 0 deletions

View File

@ -146,6 +146,9 @@ Bug Fixes
thrown by PingRequestHandler. Do not log exceptions if a user tries to view a
hidden file using ShowFileRequestHandler. (Tomás Fernández Löbbe via James Dyer)
* SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token.
(Tom Burton-West, Robert Muir)
Other Changes
----------------------

View File

@ -243,6 +243,8 @@ class ExtendedDismaxQParser extends QParser {
// For correct lucene queries, turn off mm processing if there
// were explicit operators (except for AND).
boolean doMinMatched = (numOR + numNOT + numPluses + numMinuses) == 0;
// but always for unstructured implicit bqs created by getFieldQuery
up.minShouldMatch = minShouldMatch;
try {
up.setRemoveStopFilter(!stopwords);
@ -888,6 +890,7 @@ class ExtendedDismaxQParser extends QParser {
private Map<String, Analyzer> nonStopFilterAnalyzerPerField;
private boolean removeStopFilter;
String minShouldMatch; // for inner boolean queries produced from a single fieldQuery
/**
* Where we store a map from field name we expect to see in our query
@ -1161,6 +1164,18 @@ class ExtendedDismaxQParser extends QParser {
case FIELD: // fallthrough
case PHRASE:
Query query = super.getFieldQuery(field, val, type == QType.PHRASE);
// A BooleanQuery is only possible from getFieldQuery if it came from
// a single whitespace separated term. In this case, check the coordination
// factor on the query: if its enabled, that means we aren't a set of synonyms
// but instead multiple terms from one whitespace-separated term, we must
// apply minShouldMatch here so that it works correctly with other things
// like aliasing.
if (query instanceof BooleanQuery) {
BooleanQuery bq = (BooleanQuery) query;
if (!bq.isCoordDisabled()) {
SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch);
}
}
if (query instanceof PhraseQuery) {
PhraseQuery pq = (PhraseQuery)query;
if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null;

View File

@ -60,6 +60,13 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase {
assertU(adoc("id", "52", "text_sw", "tekna theou klethomen"));
assertU(adoc("id", "53", "text_sw", "nun tekna theou esmen"));
assertU(adoc("id", "54", "text_sw", "phanera estin ta tekna tou theou"));
assertU(adoc("id", "55", "standardtok", ""));
assertU(adoc("id", "56", "standardtok", "大亚"));
assertU(adoc("id", "57", "standardtok", "大亚湾"));
assertU(adoc("id", "58", "HTMLstandardtok", ""));
assertU(adoc("id", "59", "HTMLstandardtok", "大亚"));
assertU(adoc("id", "60", "HTMLstandardtok", "大亚湾"));
assertU(adoc("id", "61", "text_sw", "bazaaa")); // synonyms in an expansion group
assertU(commit());
}
@Override
@ -774,4 +781,153 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase {
, "*[count(//doc)=1]");
}
/**
* SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token
*/
public void testCJK() throws Exception {
assertQ("test cjk (disjunction)",
req("q", "大亚湾",
"qf", "standardtok",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=3]");
assertQ("test cjk (minShouldMatch)",
req("q", "大亚湾",
"qf", "standardtok",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=2]");
assertQ("test cjk (conjunction)",
req("q", "大亚湾",
"qf", "standardtok",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=1]");
}
/**
* test that minShouldMatch works with aliasing
* for implicit boolean queries
*/
public void testCJKAliasing() throws Exception {
// single field
assertQ("test cjk (aliasing+disjunction)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=3]");
assertQ("test cjk (aliasing+minShouldMatch)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=2]");
assertQ("test cjk (aliasing+conjunction)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=1]");
// multifield
assertQ("test cjk (aliasing+disjunction)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok HTMLstandardtok",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=6]");
assertQ("test cjk (aliasing+minShouldMatch)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok HTMLstandardtok",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=4]");
assertQ("test cjk (aliasing+conjunction)",
req("q", "myalias:大亚湾",
"f.myalias.qf", "standardtok HTMLstandardtok",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=2]");
}
/** Test that we apply boosts correctly */
public void testCJKBoosts() throws Exception {
assertQ("test cjk (disjunction)",
req("q", "大亚湾",
"qf", "standardtok^2 HTMLstandardtok",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='57']");
assertQ("test cjk (minShouldMatch)",
req("q", "大亚湾",
"qf", "standardtok^2 HTMLstandardtok",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='57']");
assertQ("test cjk (conjunction)",
req("q", "大亚湾",
"qf", "standardtok^2 HTMLstandardtok",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='57']");
// now boost the other field
assertQ("test cjk (disjunction)",
req("q", "大亚湾",
"qf", "standardtok HTMLstandardtok^2",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='60']");
assertQ("test cjk (minShouldMatch)",
req("q", "大亚湾",
"qf", "standardtok HTMLstandardtok^2",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='60']");
assertQ("test cjk (conjunction)",
req("q", "大亚湾",
"qf", "standardtok HTMLstandardtok^2",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='60']");
}
/** always apply minShouldMatch to the inner booleanqueries
* created from whitespace, as these are never structured lucene queries
* but only come from unstructured text */
public void testCJKStructured() throws Exception {
assertQ("test cjk (disjunction)",
req("q", "大亚湾 OR bogus",
"qf", "standardtok",
"mm", "0%",
"defType", "edismax")
, "*[count(//doc)=3]");
assertQ("test cjk (minShouldMatch)",
req("q", "大亚湾 OR bogus",
"qf", "standardtok",
"mm", "67%",
"defType", "edismax")
, "*[count(//doc)=2]");
assertQ("test cjk (conjunction)",
req("q", "大亚湾 OR bogus",
"qf", "standardtok",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=1]");
}
/**
* Test that we don't apply minShouldMatch to the inner boolean queries
* when there are synonyms (these are indicated by coordination factor)
*/
public void testSynonyms() throws Exception {
// document only contains baraaa, but should still match.
assertQ("test synonyms",
req("q", "fooaaa",
"qf", "text_sw",
"mm", "100%",
"defType", "edismax")
, "*[count(//doc)=1]");
}
}