SOLR-1604: Wildcards, ORs etc inside Phrase Queries or 'ComplexPhraseQueryParser support in Solr'

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1578200 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erick Erickson 2014-03-17 01:11:15 +00:00
parent bcedf1bba1
commit 023aa2bf2a
7 changed files with 556 additions and 1 deletions

View File

@ -128,6 +128,8 @@ New Features
* SOLR-3177: Enable tagging and excluding filters in StatsComponent via the
localParams syntax. (Mathias H., Nikolai Luthman, Vitaliy Zhovtyuk, shalin)
* SOLR-1604: Wildcards, ORs etc inside Phrase Queries. (Ahmet Arslan via Erick Erickson)
Bug Fixes
----------------------

View File

@ -0,0 +1,117 @@
package org.apache.solr.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.Query;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.parser.QueryParser;
import org.apache.solr.request.SolrQueryRequest;
/**
* Parse Solr's variant on the Lucene {@link org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser} syntax.
* <p/>
* Modified from {@link org.apache.solr.search.LuceneQParserPlugin} and {@link org.apache.solr.search.SurroundQParserPlugin}
*/
public class ComplexPhraseQParserPlugin extends QParserPlugin {
public static final String NAME = "complexphrase";
private boolean inOrder = true;
@Override
public void init(NamedList args) {
if (args != null) {
Object val = args.get("inOrder");
if (val != null) {
inOrder = StrUtils.parseBool(val.toString());
}
}
}
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
ComplexPhraseQParser qParser = new ComplexPhraseQParser(qstr, localParams, params, req);
qParser.setInOrder(inOrder);
return qParser;
}
}
/**
* Modified from {@link org.apache.solr.search.LuceneQParser} and {@link org.apache.solr.search.SurroundQParser}
*/
class ComplexPhraseQParser extends QParser {
ComplexPhraseQueryParser lparser;
boolean inOrder = true;
/**
* When <code>inOrder</code> is true, the search terms must
* exists in the documents as the same order as in query.
*
* @param inOrder parameter to choose between ordered or un-ordered proximity search
*/
public void setInOrder(final boolean inOrder) {
this.inOrder = inOrder;
}
public ComplexPhraseQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
super(qstr, localParams, params, req);
}
@Override
public Query parse() throws SyntaxError {
String qstr = getString();
String defaultField = getParam(CommonParams.DF);
if (defaultField == null) {
defaultField = getReq().getSchema().getDefaultSearchFieldName();
}
lparser = new ComplexPhraseQueryParser(getReq().getCore().getSolrConfig().luceneMatchVersion, defaultField, getReq().getSchema().getQueryAnalyzer());
if (localParams != null)
inOrder = localParams.getBool("inOrder", inOrder);
lparser.setInOrder(inOrder);
QueryParser.Operator defaultOperator = QueryParsing.getQueryParserDefaultOperator(getReq().getSchema(), getParam(QueryParsing.OP));
if (QueryParser.Operator.AND.equals(defaultOperator))
lparser.setDefaultOperator(org.apache.lucene.queryparser.classic.QueryParser.Operator.AND);
else
lparser.setDefaultOperator(org.apache.lucene.queryparser.classic.QueryParser.Operator.OR);
try {
return lparser.parse(qstr);
} catch (ParseException pe) {
throw new SyntaxError(pe);
}
}
@Override
public String[] getDefaultHighlightFields() {
return lparser == null ? new String[]{} : new String[]{lparser.getField()};
}
}

View File

@ -59,7 +59,8 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
BlockJoinParentQParserPlugin.NAME, BlockJoinParentQParserPlugin.class,
BlockJoinChildQParserPlugin.NAME, BlockJoinChildQParserPlugin.class,
CollapsingQParserPlugin.NAME, CollapsingQParserPlugin.class,
SimpleQParserPlugin.NAME, SimpleQParserPlugin.class
SimpleQParserPlugin.NAME, SimpleQParserPlugin.class,
ComplexPhraseQParserPlugin.NAME, ComplexPhraseQParserPlugin.class
};
/** return a {@link QParser} */

View File

@ -0,0 +1,40 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="test" version="1.2">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldtype name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="true"/>
<field name="name" type="nametext" indexed="true" stored="true"/>
<field name="title" type="nametext" indexed="true" stored="true"/>
<field name="text" type="nametext" indexed="true" stored="true"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="AND"/>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -28,4 +28,9 @@
<!-- query parser without final NAME field lead to NPE during query parser initialization-->
<queryParser name="fail" class="solr.search.LuceneQParserPlugin"/>
<!-- Un-ordered complex phrase query parser Same behaviour as Lucene's Sloppy PhraseQuery where slop is greater than 0 -->
<queryParser name="unorderedcomplexphrase" class="solr.search.ComplexPhraseQParserPlugin">
<bool name="inOrder">false</bool>
</queryParser>
</config>

View File

@ -344,6 +344,13 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
"and(apache,solr)", "apache AND solr");
}
public void testQueryComplexPhrase() throws Exception {
assertQueryEquals("complexphrase", "{!complexphrase df=text}\"jo* smith\"",
"text:\"jo* smith\"");
assertQueryEquals("complexphrase", "{!complexphrase df=title}\"jo* smith\"",
"title:\"jo* smith\"");
}
public void testFuncTestfunc() throws Exception {
assertFuncEquals("testfunc(foo_i)","testfunc(field(foo_i))");
assertFuncEquals("testfunc(23)");

View File

@ -0,0 +1,383 @@
package org.apache.solr.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.HashMap;
public class TestComplexPhraseQParserPlugin extends AbstractSolrTestCase {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-query-parser-init.xml","schema-complex-phrase.xml");
}
@Override
public void setUp() throws Exception {
super.setUp();
}
@Override
public void tearDown() throws Exception {
super.tearDown();
}
@Test
public void testDefaultField() {
assertU(adoc("text", "john smith", "id", "1"));
assertU(adoc("text", "johathon smith", "id", "2"));
assertU(adoc("text", "john percival smith", "id", "3"));
assertU(commit());
assertU(optimize());
assertQ(req("q", "{!complexphrase} \"john smith\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='1']"
);
assertQ(req("q", "{!complexphrase} \"j* smyth~\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!complexphrase} \"(jo* -john) smith\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!complexphrase} \"jo* smith\"~2")
, "//result[@numFound='3']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
, "//doc[./int[@name='id']='3']"
);
assertQ(req("q", "{!complexphrase} \"jo* [sma TO smz]\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!complexphrase} \"john\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='3']"
);
assertQ(req("q", "{!complexphrase} \"(john johathon) smith\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
}
@Test
public void test() {
HashMap<String, String> args = new HashMap<String, String>();
args.put(QueryParsing.DEFTYPE, ComplexPhraseQParserPlugin.NAME);
args.put(CommonParams.FL, "id");
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertU(adoc("name", "john smith", "id", "1"));
assertU(adoc("name", "johathon smith", "id", "2"));
assertU(adoc("name", "john percival smith", "id", "3"));
assertU(commit());
assertU(optimize());
assertQ("Simple multi-term still works",
sumLRF.makeRequest("name:\"john smith\""),
"//doc[./int[@name='id']='1']",
"//result[@numFound='1']"
);
assertQ(req("q", "{!complexphrase} name:\"john smith\""),
"//doc[./int[@name='id']='1']",
"//result[@numFound='1']"
);
assertQ("wildcards and fuzzies are OK in phrases",
sumLRF.makeRequest("name:\"j* smyth~\""),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='2']",
"//result[@numFound='2']"
);
assertQ("boolean logic works",
sumLRF.makeRequest("name:\"(jo* -john) smith\""),
"//doc[./int[@name='id']='2']",
"//result[@numFound='1']"
);
assertQ("position logic works",
sumLRF.makeRequest("name:\"jo* smith\"~2"),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='2']",
"//doc[./int[@name='id']='3']",
"//result[@numFound='3']"
);
assertQ("range queries supported",
sumLRF.makeRequest("name:\"jo* [sma TO smz]\""),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='2']",
"//result[@numFound='2']"
);
assertQ("Simple single-term still works",
sumLRF.makeRequest("name:\"john\""),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='3']",
"//result[@numFound='2']"
);
assertQ("OR inside phrase works",
sumLRF.makeRequest("name:\"(john johathon) smith\""),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='2']",
"//result[@numFound='2']"
);
}
@Test
public void testPhraseHighlighter() {
HashMap<String, String> args = new HashMap<String, String>();
args.put(QueryParsing.DEFTYPE, ComplexPhraseQParserPlugin.NAME);
args.put(CommonParams.FL, "id");
args.put(HighlightParams.HIGHLIGHT, Boolean.TRUE.toString());
args.put(HighlightParams.USE_PHRASE_HIGHLIGHTER, Boolean.TRUE.toString());
args.put(HighlightParams.FIELD_MATCH, Boolean.FALSE.toString());
args.put(HighlightParams.FRAGSIZE, String.valueOf(0));
args.put(HighlightParams.FIELDS, "name");
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
"standard", 0, 200, args);
assertU(adoc("name", "john smith smith john", "id", "1"));
assertU(adoc("name", "johathon smith smith johathon", "id", "2"));
assertU(adoc("name", "john percival smith", "id", "3"));
assertU(commit());
assertU(optimize());
assertQ("range queries supported",
sumLRF.makeRequest("name:[sma TO smz]"),
"//doc[./int[@name='id']='1']",
"//doc[./int[@name='id']='2']",
"//doc[./int[@name='id']='3']",
"//result[@numFound='3']"
);
sumLRF = h.getRequestFactory("standard", 0, 200, args);
assertQ("PhraseHighlighter=true Test",
sumLRF.makeRequest("name:\"(john johathon) smith\""),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='name']/str[.='<em>john</em> <em>smith</em> smith john']",
"//lst[@name='highlighting']/lst[@name='2']",
"//lst[@name='2']/arr[@name='name']/str[.='<em>johathon</em> <em>smith</em> smith johathon']"
);
args.put(HighlightParams.USE_PHRASE_HIGHLIGHTER, Boolean.FALSE.toString());
sumLRF = h.getRequestFactory("standard", 0, 200, args);
assertQ("PhraseHighlighter=false Test",
sumLRF.makeRequest("name:\"(john johathon) smith\""),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='name']/str[.='<em>john</em> <em>smith</em> <em>smith</em> <em>john</em>']",
"//lst[@name='highlighting']/lst[@name='2']",
"//lst[@name='2']/arr[@name='name']/str[.='<em>johathon</em> <em>smith</em> <em>smith</em> <em>johathon</em>']"
);
/*
assertQ("Highlight Plain Prefix Query Test",
sumLRF.makeRequest("name:jo*"),
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1']/arr[@name='name']/str[.='<em>john</em> smith smith <em>john</em>']",
"//lst[@name='highlighting']/lst[@name='2']",
"//lst[@name='2']/arr[@name='name']/str[.='<em>johathon</em> smith smith <em>johathon</em>']",
"//lst[@name='highlighting']/lst[@name='3']",
"//lst[@name='3']/arr[@name='name']/str[.='<em>john</em> percival smith']"
);
*/
}
@Test
public void testMultipleFields() {
assertU(adoc("text", "protein digest", "name", "dna rules", "id", "1"));
assertU(adoc("text", "digest protein", "name", "rna is the workhorse", "id", "2"));
assertU(adoc("text", "dna rules", "name", "protein digest", "id", "3"));
assertU(adoc("text", "dna really rules", "name", "digest protein", "id", "4"));
assertU(commit());
assertU(optimize());
assertQ(req("q", "{!complexphrase} name:\"protein digest\" AND text:\"dna rules\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='3']"
);
assertQ(req("q", "{!complexphrase} name:\"prot* dige*\" AND text:\"d* r*\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='3']"
);
assertQ(req("q", "{!complexphrase inOrder=\"false\"} name:\"dna* rule*\" AND text:\"prot* diges*\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='1']"
);
assertQ(req("q", "{!unorderedcomplexphrase} name:\"protein digest\" AND text:\"dna rules\"~2")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='3']"
, "//doc[./int[@name='id']='4']"
);
assertQ(req("q", "{!unorderedcomplexphrase inOrder=\"true\"} name:\"protein digest\" AND text:\"dna rules\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='3']"
);
}
@Test
public void testUnorderedPhraseQuery() {
assertU(adoc("text", "protein digest", "id", "1"));
assertU(adoc("text", "digest protein", "id", "2"));
assertU(adoc("name", "protein digest", "id", "3"));
assertU(adoc("name", "digest protein", "id", "4"));
assertU(commit());
assertU(optimize());
/**
* ordered phrase query return only fist document
*/
assertQ(req("q", "{!complexphrase} \"protein digest\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='1']"
);
assertQ(req("q", "{!complexphrase} \"pro* di*\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='1']"
);
assertQ(req("q", "{!complexphrase} name:\"protein digest\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='3']"
);
assertQ(req("q", "{!complexphrase} name:\"pro* di*\"")
, "//result[@numFound='1']"
, "//doc[./int[@name='id']='3']"
);
/**
* unordered phrase query returns two documents.
*/
assertQ(req("q", "{!unorderedcomplexphrase} \"digest protein\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!unorderedcomplexphrase} \"di* pro*\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!unorderedcomplexphrase} name:\"digest protein\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='3']"
, "//doc[./int[@name='id']='4']"
);
assertQ(req("q", "{!unorderedcomplexphrase} name:\"di* pro*\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='3']"
, "//doc[./int[@name='id']='4']"
);
/**
* inOrder parameter can be defined with local params syntax.
*/
assertQ(req("q", "{!complexphrase inOrder=false} \"di* pro*\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
/**
* inOrder and df parameters can be defined with local params syntax.
*/
assertQ(req("q", "{!complexphrase inOrder=false df=name} \"di* pro*\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='3']"
, "//doc[./int[@name='id']='4']"
);
}
/**
* the query "sulfur-reducing bacteria" was crashing due to the dash inside the phrase.
*/
@Test public void testHyphenInPhrase() {
assertU(adoc("text", "sulfur-reducing bacteria", "id", "1"));
assertU(adoc("text", "sulfur reducing bacteria", "id", "2"));
assertU(adoc("name", "sulfur-reducing bacteria", "id", "3"));
assertU(adoc("name", "sulfur reducing bacteria", "id", "4"));
assertU(commit());
assertU(optimize());
assertQ(req("q", "{!complexphrase} \"sulfur-reducing bacteria\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='1']"
, "//doc[./int[@name='id']='2']"
);
assertQ(req("q", "{!complexphrase} name:\"sulfur-reducing bacteria\"")
, "//result[@numFound='2']"
, "//doc[./int[@name='id']='3']"
, "//doc[./int[@name='id']='4']"
);
}
}