SOLR-1553: extended dismax parser
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@881546 13f79535-47bb-0310-9956-ffa450edef68
parent 2886626aea
commit 1cfb5a2fe3
@@ -34,9 +34,14 @@ Detailed Change List
New Features
----------------------

1. SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan and Euclidean.
 * SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan and Euclidean.
   Also added geohash(), deg() and rad() convenience functions. See http://wiki.apache.org/solr/FunctionQuery. (gsingers)

 * SOLR-1553: New dismax parser implementation (accessible as "edismax")
   that supports full lucene syntax, improved reserved char escaping,
   fielded queries, improved proximity boosting, and improved stopword
   handling. (yonik)

Optimizations
----------------------

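The new parser is driven by ordinary request parameters: defType=edismax selects it, qf lists the fields to query, and q carries the user query (the same parameters the test added below exercises). As a rough illustration only, not part of this commit, the following standalone Java sketch assembles such a request URL; the host, port, and /solr/select path are assumed defaults.

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

// Illustrative sketch only, not part of this commit: builds an example /select
// request that exercises the new "edismax" parser. The host, port and
// "/solr/select" path are assumed defaults; defType, qf and q are the
// parameters used by the tests in this change.
public class EdismaxRequestSketch {
  public static void main(String[] args) throws UnsupportedEncodingException {
    String base = "http://localhost:8983/solr/select";        // assumption: default example URL
    String qf   = URLEncoder.encode("name title subject text", "UTF-8");
    String q    = URLEncoder.encode("Order AND op", "UTF-8");  // full boolean syntax is accepted
    System.out.println(base + "?defType=edismax&qf=" + qf + "&q=" + q);
  }
}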
File diff suppressed because it is too large
@@ -32,6 +32,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin {
    PrefixQParserPlugin.NAME, PrefixQParserPlugin.class,
    BoostQParserPlugin.NAME, BoostQParserPlugin.class,
    DisMaxQParserPlugin.NAME, DisMaxQParserPlugin.class,
    ExtendedDismaxQParserPlugin.NAME, ExtendedDismaxQParserPlugin.class,
    FieldQParserPlugin.NAME, FieldQParserPlugin.class,
    RawQParserPlugin.NAME, RawQParserPlugin.class,
    NestedQParserPlugin.NAME, NestedQParserPlugin.class,
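The hunk above adds the new plugin to the table of standard query parsers, which pairs each parser name with its implementing class so that a request's defType value can be resolved to a parser. The standalone sketch below, which is not Solr source, models that registration table as a plain map; only the "dismax" and "edismax" names are taken from this change, and the class names are written as strings so the example compiles without Solr.

import java.util.LinkedHashMap;
import java.util.Map;

// Rough sketch, not Solr source: models the (name, class) registration table
// above as a simple map. With this change, defType=edismax resolves to the new
// ExtendedDismaxQParserPlugin alongside the existing "dismax" entry. Class names
// are plain strings so the sketch compiles without Solr on the classpath.
public class ParserRegistrySketch {
  public static void main(String[] args) {
    Map<String, String> standardPlugins = new LinkedHashMap<String, String>();
    standardPlugins.put("dismax",  "org.apache.solr.search.DisMaxQParserPlugin");
    standardPlugins.put("edismax", "org.apache.solr.search.ExtendedDismaxQParserPlugin");
    // ... the remaining plugins from the table above are registered the same way

    String defType = "edismax"; // value a request would pass as defType
    System.out.println(defType + " -> " + standardPlugins.get(defType));
  }
}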
@@ -0,0 +1,170 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.solr.util.AbstractSolrTestCase;

public class TestExtendedDismaxParser extends AbstractSolrTestCase {
  public String getSchemaFile() { return "schema12.xml"; }
  public String getSolrConfigFile() { return "solrconfig.xml"; }
  // public String getCoreName() { return "collection1"; }

  public void setUp() throws Exception {
    // if you override setUp or tearDown, you better call
    // the superclass's version
    super.setUp();
  }
  public void tearDown() throws Exception {
    // if you override setUp or tearDown, you better call
    // the superclass's version
    super.tearDown();
  }

  // test the edismax query parser based on the dismax parser
  public void testFocusQueryParser() {
    assertU(adoc("id", "42", "trait_ss", "Tool", "trait_ss", "Obnoxious",
                 "name", "Zapp Brannigan"));
    assertU(adoc("id", "43",
                 "title", "Democratic Order op Planets"));
    assertU(adoc("id", "44", "trait_ss", "Tool",
                 "name", "The Zapper"));
    assertU(adoc("id", "45", "trait_ss", "Chauvinist",
                 "title", "25 star General"));
    assertU(adoc("id", "46", "trait_ss", "Obnoxious",
                 "subject", "Defeated the pacifists op the Gandhi nebula"));
    assertU(adoc("id", "47", "trait_ss", "Pig",
                 "text", "line up and fly directly at the enemy death cannons, clogging them with wreckage!"));
    assertU(adoc("id", "48", "text_sw", "this has gigabyte potential", "foo_i","100"));
    assertU(adoc("id", "49", "text_sw", "start the big apple end", "foo_i","-100"));
    assertU(adoc("id", "50", "text_sw", "start new big city end"));

    assertU(commit());
    String allq = "id:[42 TO 50]";
    String allr = "*[count(//doc)=9]";
    String oner = "*[count(//doc)=1]";
    String twor = "*[count(//doc)=2]";
    String nor = "*[count(//doc)=0]";


    assertQ("standard request handler returns all matches",
            req(allq),
            allr
    );

    assertQ("edismax query parser returns all matches",
            req("q", allq,
                "defType", "edismax"
            ),
            allr
    );

    assertQ(req("defType", "edismax", "qf", "trait_ss",
                "q","Tool"), twor
    );

    // test that field types that aren't applicable don't cause an exception to be thrown
    assertQ(req("defType", "edismax", "qf", "trait_ss foo_i foo_f foo_dt foo_l foo_d foo_b",
                "q","Tool"), twor
    );

    // test that numeric field types can be queried
    assertQ(req("defType", "edismax", "qf", "text_sw",
                "q","foo_i:100"), oner
    );

    // test that numeric field types can be queried
    assertQ(req("defType", "edismax", "qf", "text_sw",
                "q","foo_i:-100"), oner
    );

    // test that numeric field types can be queried via qf
    assertQ(req("defType", "edismax", "qf", "text_sw foo_i",
                "q","100"), oner
    );

    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","op"), twor
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","Order op"), oner
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","Order AND op"), oner
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","Order and op"), oner
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","+Order op"), oner
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","Order OR op"), twor
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","Order or op"), twor
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","*:*"), allr
    );

    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","star OR (-star)"), allr
    );
    assertQ(req("defType", "edismax", "qf", "name title subject text",
                "q","id:42 OR (-id:42)"), allr
    );

    // test that basic synonyms work
    assertQ(req("defType", "edismax", "qf", "text_sw",
                "q","GB"), oner
    );

    // test for stopword removal in main query part
    assertQ(req("defType", "edismax", "qf", "text_sw",
                "q","the big"), twor
    );

    // test for stopwords not removed
    assertQ(req("defType", "edismax", "qf", "text_sw", "stopwords","false",
                "q","the big"), oner
    );

    /** stopword removal in conjunction with multi-word synonyms at query time
     * breaks this test.
    // multi-word synonyms
    // remove id:50 which contains the false match
    assertQ(req("defType", "edismax", "qf", "text_t", "indent","true", "debugQuery","true",
                "q","-id:50 nyc"), oner
    );
    **/

    /*** these fail because multi-word synonyms are being used at query time
    // this will incorrectly match "new big city"
    assertQ(req("defType", "edismax", "qf", "id title",
                "q","nyc"), oner
    );

    // this will incorrectly match "new big city"
    assertQ(req("defType", "edismax", "qf", "title",
                "q","the big apple"), nor
    );
    ***/

  }

}
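Two groups of assertions above depend on configuration changed elsewhere in this commit: "the big" matches two documents because the text_sw query analyzer removes the stopword "the", but only one document once stopwords=false keeps it, and "GB" matches the "gigabyte potential" document because the query-time synonym file defines the GB,gib,gigabyte,gigabytes group. The toy program below, plain Java with no Solr dependency and only an illustration of that reasoning rather than the real analyzers, reproduces the expected match counts.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Toy illustration only (not Solr code) of the stopword and synonym expectations
// above for the text_sw field, whose query analyzer applies a StopFilter and a
// SynonymFilter while its index analyzer does not.
public class EdismaxTestReasoning {
  public static void main(String[] args) {
    Set<String> stopwords = new HashSet<String>(Arrays.asList("the", "a", "an")); // subset of stopwords.txt
    List<List<String>> docs = Arrays.asList(
        Arrays.asList("this", "has", "gigabyte", "potential"), // id 48
        Arrays.asList("start", "the", "big", "apple", "end"),  // id 49
        Arrays.asList("start", "new", "big", "city", "end"));  // id 50

    // q="the big" with query-time stopword removal: only "big" remains -> 2 matches (ids 49, 50)
    System.out.println(count(docs, removeStopwords(Arrays.asList("the", "big"), stopwords)));

    // q="the big" with stopwords=false: both terms required -> 1 match (id 49)
    System.out.println(count(docs, Arrays.asList("the", "big")));

    // q="GB" expanded through the GB,gib,gigabyte,gigabytes group -> 1 match (id 48)
    List<String> gbGroup = Arrays.asList("gb", "gib", "gigabyte", "gigabytes");
    int hits = 0;
    for (List<String> doc : docs) {
      if (!Collections.disjoint(doc, gbGroup)) hits++;
    }
    System.out.println(hits);
  }

  // drops stopwords, roughly what the query-time StopFilter does
  static List<String> removeStopwords(List<String> terms, Set<String> stopwords) {
    List<String> kept = new ArrayList<String>();
    for (String t : terms) {
      if (!stopwords.contains(t)) kept.add(t);
    }
    return kept;
  }

  // counts documents containing every required term
  static int count(List<List<String>> docs, List<String> required) {
    int hits = 0;
    for (List<String> doc : docs) {
      if (doc.containsAll(required)) hits++;
    }
    return hits;
  }
}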
@@ -339,7 +339,7 @@
    <fieldtype name="syn" class="solr.TextField">
      <analyzer>
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
          <filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
          <filter name="syn" class="solr.SynonymFilterFactory" synonyms="old_synonyms.txt"/>
      </analyzer>
    </fieldtype>

@@ -350,7 +350,7 @@
      <analyzer>
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
          <filter class="solr.SynonymFilterFactory"
                  synonyms="synonyms.txt" expand="true" />
                  synonyms="old_synonyms.txt" expand="true" />
          <filter class="solr.EnglishPorterFilterFactory"/>
          <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
      </analyzer>
@@ -326,6 +326,32 @@
    </analyzer>
  </fieldtype>

  <!-- a text field with the stop filter only on the query analyzer
    -->
  <fieldType name="text_sw" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <!-- in this example, we will only use synonyms at query time
      <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
      -->
      <!--<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
              catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.EnglishPorterFilterFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.EnglishPorterFilterFactory"/>
    </analyzer>
  </fieldType>


  <!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
       synonyms "better"
    -->
@@ -461,9 +487,11 @@

   <dynamicField name="*_mfacet" type="string" indexed="true" stored="false" multiValued="true" />

   <dynamicField name="*_sw" type="text_sw" indexed="true" stored="true" multiValued="true"/>

   <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
   <dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
   <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
   <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
   <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
   <dynamicField name="*_tt" type="text" indexed="true" stored="true"/>
@@ -12,5 +12,47 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#-----------------------------------------------------------------------
# a couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb

#Standard english stop words taken from Lucene's StopAnalyzer
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with

@@ -1,6 +1,3 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
@@ -12,11 +9,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
a => aa
b => b1 b2
c => c1,c2

#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
foo,bar,baz
fooaaa,baraaa,bazaaa

# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
#after us won't split it into two words.

# Synonym mappings can be used for spelling correction too
pixima => pixma

Television,TV,Televisions
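The file above mixes the two rule forms accepted by SynonymFilterFactory: explicit mappings written with => and comma-separated groups of equivalent terms. The toy parser below, an illustration rather than Solr's actual parsing code, distinguishes the two forms for a couple of the lines above; it deliberately ignores the escaped \=> and \, cases.

import java.util.Arrays;

// Toy parser (not Solr's SynonymFilterFactory code) for the two rule forms in the
// file above: "pixima => pixma" style explicit mappings and
// "GB,gib,gigabyte,gigabytes" style equivalence groups. The escaped "\=>" and "\,"
// cases shown in the file are deliberately ignored here.
public class SynonymRuleSketch {
  public static void main(String[] args) {
    String[] lines = { "pixima => pixma", "GB,gib,gigabyte,gigabytes" };
    for (String line : lines) {
      if (line.contains("=>")) {
        String[] sides = line.split("=>", 2);
        System.out.println("map \"" + sides[0].trim() + "\" to \"" + sides[1].trim() + "\"");
      } else {
        System.out.println("equivalent terms: " + Arrays.asList(line.split(",")));
      }
    }
  }
}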