From d93f599e32415c1e93ebceaaf015bffdd8270a49 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 7 Nov 2012 03:25:20 +0000 Subject: [PATCH] SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1406437 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 3 + .../search/ExtendedDismaxQParserPlugin.java | 15 ++ .../solr/search/TestExtendedDismaxParser.java | 156 ++++++++++++++++++ 3 files changed, 174 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9b94ea72a03..11a7ed0670a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -146,6 +146,9 @@ Bug Fixes thrown by PingRequestHandler. Do not log exceptions if a user tries to view a hidden file using ShowFileRequestHandler. (Tomás Fernández Löbbe via James Dyer) +* SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token. + (Tom Burton-West, Robert Muir) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java index 12692dfa1bd..e1724d9602c 100755 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java @@ -243,6 +243,8 @@ class ExtendedDismaxQParser extends QParser { // For correct lucene queries, turn off mm processing if there // were explicit operators (except for AND). boolean doMinMatched = (numOR + numNOT + numPluses + numMinuses) == 0; + // but always for unstructured implicit bqs created by getFieldQuery + up.minShouldMatch = minShouldMatch; try { up.setRemoveStopFilter(!stopwords); @@ -888,6 +890,7 @@ class ExtendedDismaxQParser extends QParser { private Map nonStopFilterAnalyzerPerField; private boolean removeStopFilter; + String minShouldMatch; // for inner boolean queries produced from a single fieldQuery /** * Where we store a map from field name we expect to see in our query @@ -1161,6 +1164,18 @@ class ExtendedDismaxQParser extends QParser { case FIELD: // fallthrough case PHRASE: Query query = super.getFieldQuery(field, val, type == QType.PHRASE); + // A BooleanQuery is only possible from getFieldQuery if it came from + // a single whitespace separated term. In this case, check the coordination + // factor on the query: if its enabled, that means we aren't a set of synonyms + // but instead multiple terms from one whitespace-separated term, we must + // apply minShouldMatch here so that it works correctly with other things + // like aliasing. + if (query instanceof BooleanQuery) { + BooleanQuery bq = (BooleanQuery) query; + if (!bq.isCoordDisabled()) { + SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch); + } + } if (query instanceof PhraseQuery) { PhraseQuery pq = (PhraseQuery)query; if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null; diff --git a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java index e8ee726c5b0..0c97d86d69c 100755 --- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java @@ -60,6 +60,13 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase { assertU(adoc("id", "52", "text_sw", "tekna theou klethomen")); assertU(adoc("id", "53", "text_sw", "nun tekna theou esmen")); assertU(adoc("id", "54", "text_sw", "phanera estin ta tekna tou theou")); + assertU(adoc("id", "55", "standardtok", "大")); + assertU(adoc("id", "56", "standardtok", "大亚")); + assertU(adoc("id", "57", "standardtok", "大亚湾")); + assertU(adoc("id", "58", "HTMLstandardtok", "大")); + assertU(adoc("id", "59", "HTMLstandardtok", "大亚")); + assertU(adoc("id", "60", "HTMLstandardtok", "大亚湾")); + assertU(adoc("id", "61", "text_sw", "bazaaa")); // synonyms in an expansion group assertU(commit()); } @Override @@ -774,4 +781,153 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase { , "*[count(//doc)=1]"); } + + /** + * SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token + */ + public void testCJK() throws Exception { + assertQ("test cjk (disjunction)", + req("q", "大亚湾", + "qf", "standardtok", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=3]"); + assertQ("test cjk (minShouldMatch)", + req("q", "大亚湾", + "qf", "standardtok", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=2]"); + assertQ("test cjk (conjunction)", + req("q", "大亚湾", + "qf", "standardtok", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=1]"); + } + + /** + * test that minShouldMatch works with aliasing + * for implicit boolean queries + */ + public void testCJKAliasing() throws Exception { + // single field + assertQ("test cjk (aliasing+disjunction)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=3]"); + assertQ("test cjk (aliasing+minShouldMatch)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=2]"); + assertQ("test cjk (aliasing+conjunction)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=1]"); + // multifield + assertQ("test cjk (aliasing+disjunction)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok HTMLstandardtok", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=6]"); + assertQ("test cjk (aliasing+minShouldMatch)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok HTMLstandardtok", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=4]"); + assertQ("test cjk (aliasing+conjunction)", + req("q", "myalias:大亚湾", + "f.myalias.qf", "standardtok HTMLstandardtok", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=2]"); + } + + /** Test that we apply boosts correctly */ + public void testCJKBoosts() throws Exception { + assertQ("test cjk (disjunction)", + req("q", "大亚湾", + "qf", "standardtok^2 HTMLstandardtok", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='57']"); + assertQ("test cjk (minShouldMatch)", + req("q", "大亚湾", + "qf", "standardtok^2 HTMLstandardtok", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='57']"); + assertQ("test cjk (conjunction)", + req("q", "大亚湾", + "qf", "standardtok^2 HTMLstandardtok", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='57']"); + + // now boost the other field + assertQ("test cjk (disjunction)", + req("q", "大亚湾", + "qf", "standardtok HTMLstandardtok^2", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='60']"); + assertQ("test cjk (minShouldMatch)", + req("q", "大亚湾", + "qf", "standardtok HTMLstandardtok^2", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='60']"); + assertQ("test cjk (conjunction)", + req("q", "大亚湾", + "qf", "standardtok HTMLstandardtok^2", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='60']"); + } + + /** always apply minShouldMatch to the inner booleanqueries + * created from whitespace, as these are never structured lucene queries + * but only come from unstructured text */ + public void testCJKStructured() throws Exception { + assertQ("test cjk (disjunction)", + req("q", "大亚湾 OR bogus", + "qf", "standardtok", + "mm", "0%", + "defType", "edismax") + , "*[count(//doc)=3]"); + assertQ("test cjk (minShouldMatch)", + req("q", "大亚湾 OR bogus", + "qf", "standardtok", + "mm", "67%", + "defType", "edismax") + , "*[count(//doc)=2]"); + assertQ("test cjk (conjunction)", + req("q", "大亚湾 OR bogus", + "qf", "standardtok", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=1]"); + } + + /** + * Test that we don't apply minShouldMatch to the inner boolean queries + * when there are synonyms (these are indicated by coordination factor) + */ + public void testSynonyms() throws Exception { + // document only contains baraaa, but should still match. + assertQ("test synonyms", + req("q", "fooaaa", + "qf", "text_sw", + "mm", "100%", + "defType", "edismax") + , "*[count(//doc)=1]"); + } }