SOLR-1553: extended dismax parser

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@881546 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-11-17 21:46:38 +00:00
parent 2886626aea
commit 1cfb5a2fe3
8 changed files with 1424 additions and 41 deletions

View File

@ -34,9 +34,14 @@ Detailed Change List
New Features
----------------------
1. SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan and Euclidean.
* SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan and Euclidean.
Also added geohash(), deg() and rad() convenience functions. See http://wiki.apache.org/solr/FunctionQuery. (gsingers)
* SOLR-1553: New dismax parser implementation (accessible as "edismax")
that supports full Lucene query syntax, improved reserved character escaping,
fielded queries, improved proximity boosting, and improved stopword
handling. (yonik)
Optimizations
----------------------

File diff suppressed because it is too large Load Diff

View File

@ -32,6 +32,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin {
PrefixQParserPlugin.NAME, PrefixQParserPlugin.class,
BoostQParserPlugin.NAME, BoostQParserPlugin.class,
DisMaxQParserPlugin.NAME, DisMaxQParserPlugin.class,
ExtendedDismaxQParserPlugin.NAME, ExtendedDismaxQParserPlugin.class,
FieldQParserPlugin.NAME, FieldQParserPlugin.class,
RawQParserPlugin.NAME, RawQParserPlugin.class,
NestedQParserPlugin.NAME, NestedQParserPlugin.class,

View File

@ -0,0 +1,170 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.solr.util.AbstractSolrTestCase;
/**
 * Exercises the extended dismax ("edismax") query parser using the
 * schema12.xml / solrconfig.xml test configuration.
 */
public class TestExtendedDismaxParser extends AbstractSolrTestCase {

  public String getSchemaFile() {
    return "schema12.xml";
  }

  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  // public String getCoreName() { return "collection1"; }

  public void setUp() throws Exception {
    // An overriding setUp must always delegate to the superclass version.
    super.setUp();
  }

  public void tearDown() throws Exception {
    // An overriding tearDown must always delegate to the superclass version.
    super.tearDown();
  }

  /** Adds the nine documents (ids 42 through 50) queried by the test and commits them. */
  private void indexTestDocs() {
    assertU(adoc("id", "42",
                 "trait_ss", "Tool",
                 "trait_ss", "Obnoxious",
                 "name", "Zapp Brannigan"));
    assertU(adoc("id", "43",
                 "title", "Democratic Order op Planets"));
    assertU(adoc("id", "44",
                 "trait_ss", "Tool",
                 "name", "The Zapper"));
    assertU(adoc("id", "45",
                 "trait_ss", "Chauvinist",
                 "title", "25 star General"));
    assertU(adoc("id", "46",
                 "trait_ss", "Obnoxious",
                 "subject", "Defeated the pacifists op the Gandhi nebula"));
    assertU(adoc("id", "47",
                 "trait_ss", "Pig",
                 "text", "line up and fly directly at the enemy death cannons, clogging them with wreckage!"));
    assertU(adoc("id", "48",
                 "text_sw", "this has gigabyte potential",
                 "foo_i", "100"));
    assertU(adoc("id", "49",
                 "text_sw", "start the big apple end",
                 "foo_i", "-100"));
    assertU(adoc("id", "50",
                 "text_sw", "start new big city end"));
    assertU(commit());
  }

  /**
   * Issues {@code q} through the edismax parser with the given {@code qf}
   * and checks the response against a single XPath expression.
   */
  private void assertEdismax(String qf, String q, String expected) {
    assertQ(req("defType", "edismax", "qf", qf, "q", q), expected);
  }

  // test the edismax query parser based on the dismax parser
  public void testFocusQueryParser() {
    indexTestDocs();

    String everything = "id:[42 TO 50]";
    String all = "*[count(//doc)=9]";
    String one = "*[count(//doc)=1]";
    String two = "*[count(//doc)=2]";
    String none = "*[count(//doc)=0]";  // only referenced by the disabled checks at the end

    assertQ("standard request handler returns all matches",
            req(everything),
            all);

    assertQ("edismax query parser returns all matches",
            req("q", everything,
                "defType", "edismax"),
            all);

    assertEdismax("trait_ss", "Tool", two);

    // field types that aren't applicable to the query text must not cause an exception
    assertEdismax("trait_ss foo_i foo_f foo_dt foo_l foo_d foo_b", "Tool", two);

    // numeric field types can be queried explicitly, with positive and negative values
    assertEdismax("text_sw", "foo_i:100", one);
    assertEdismax("text_sw", "foo_i:-100", one);

    // numeric field types can be queried via qf
    assertEdismax("text_sw foo_i", "100", one);

    // boolean operators, in both upper and lower case
    String textFields = "name title subject text";
    assertEdismax(textFields, "op", two);
    assertEdismax(textFields, "Order op", one);
    assertEdismax(textFields, "Order AND op", one);
    assertEdismax(textFields, "Order and op", one);
    assertEdismax(textFields, "+Order op", one);
    assertEdismax(textFields, "Order OR op", two);
    assertEdismax(textFields, "Order or op", two);

    // match-all and pure-negative clauses
    assertEdismax(textFields, "*:*", all);
    assertEdismax(textFields, "star OR (-star)", all);
    assertEdismax(textFields, "id:42 OR (-id:42)", all);

    // basic synonyms work
    assertEdismax("text_sw", "GB", one);

    // stopword removal in the main query part
    assertEdismax("text_sw", "the big", two);

    // stopwords not removed when stopwords=false is passed
    assertQ(req("defType", "edismax", "qf", "text_sw", "stopwords", "false",
                "q", "the big"),
            one);

    /* DISABLED: stopword removal in conjunction with multi-word synonyms at
     * query time breaks this check.

    // multi-word synonyms
    // remove id:50 which contains the false match
    assertQ(req("defType", "edismax", "qf", "text_t", "indent", "true", "debugQuery", "true",
                "q", "-id:50 nyc"),
            one);
    */

    /* DISABLED: these fail because multi-word synonyms are being used at query time.

    // this will incorrectly match "new big city"
    assertEdismax("id title", "nyc", one);

    // this will incorrectly match "new big city"
    assertEdismax("title", "the big apple", none);
    */
  }
}

View File

@ -339,7 +339,7 @@
<fieldtype name="syn" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
<filter name="syn" class="solr.SynonymFilterFactory" synonyms="old_synonyms.txt"/>
</analyzer>
</fieldtype>
@ -350,7 +350,7 @@
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" expand="true" />
synonyms="old_synonyms.txt" expand="true" />
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>

View File

@ -325,6 +325,32 @@
<filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
</analyzer>
</fieldtype>
<!-- A text field that applies synonym expansion and stop word removal
only in the query analyzer; the index analyzer applies neither.
-->
<fieldType name="text_sw" class="solr.TextField" positionIncrementGap="100">
<!-- Index-time chain: whitespace tokenizer, word delimiter, lowercase, Porter stemmer.
Synonym and stop filters are intentionally left disabled here. -->
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!--<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
<!-- catenateWords and catenateNumbers are set to 1 at index time; the query analyzer below sets both to 0 -->
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
<!-- Query-time chain: synonyms from synonyms.txt are expanded first, then stop words
listed in stopwords.txt are removed, followed by the same word delimiter, lowercase
and stemming filters (without catenation). -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
</fieldType>
<!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
synonyms "better"
@ -461,9 +487,11 @@
<dynamicField name="*_mfacet" type="string" indexed="true" stored="false" multiValued="true" />
<dynamicField name="*_sw" type="text_sw" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_tt" type="text" indexed="true" stored="true"/>

View File

@ -1,16 +1,58 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
stopworda
stopwordb
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# a couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb
# Standard English stop words taken from Lucene's StopAnalyzer
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with

View File

@ -1,22 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
a => aa
b => b1 b2
c => c1,c2
a\=>a => b\=>b
a\,a => b\,b
foo,bar,baz
Television,TV,Televisions
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
# Note: we use "gib" instead of "GiB" so that any WordDelimiterFilter applied
# after this filter won't split it into two words.
# Synonym mappings can be used for spelling correction too
pixima => pixma