SOLR-2015: add boolean attribute autoGeneratePhraseQueries to TextField

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@979049 13f79535-47bb-0310-9956-ffa450edef68
2010-07-25 15:13:05 +00:00 · 2010-07-25 15:13:05 +00:00 · da7655c72a
parent a5430edf83
commit da7655c72a
6 changed files with 116 additions and 7 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -203,6 +203,15 @@ New Features
  	     http://wiki.apache.org/solr/SpatialSearch and the example.  Refactored some items in Lucene spatial. 
 	     Removed SpatialTileField as the underlying CartesianTier is broken beyond repair and is going to be moved. (gsingers) 
   
+* SOLR-2015: Add a boolean attribute autoGeneratePhraseQueries to TextField.
+  autoGeneratePhraseQueries="true" (the default) causes the query parser to
+  generate phrase queries if multiple tokens are generated from a single
+  non-quoted analysis string.  For example WordDelimiterFilter splitting text:pdp-11
+  will cause the parser to generate text:"pdp 11" rather than (text:PDP OR text:11).
+  Note that autoGeneratePhraseQueries="true" tends to not work well for non whitespace
+  delimited languages. (yonik)
+ 
+
 Optimizations
 ----------------------

--- a/solr/example/solr/conf/schema.xml
+++ b/solr/example/solr/conf/schema.xml
@ -213,8 +213,12 @@
        words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
        so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
        Synonyms and stopwords are customized by external files, and stemming is enabled.
+        The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
+        form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
+        to generate text:"pdp 11" rather than (text:PDP OR text:11).
+        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
        -->
-    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- in this example, we will only use synonyms at query time
--- a/solr/src/java/org/apache/solr/schema/TextField.java
+++ b/solr/src/java/org/apache/solr/schema/TextField.java
@ -46,13 +46,21 @@ import java.io.StringReader;
 * @version $Id$
 */
 public class TextField extends FieldType {
+  protected boolean autoGeneratePhraseQueries = true;
+
  protected void init(IndexSchema schema, Map<String,String> args) {
    properties |= TOKENIZED;
    if (schema.getVersion()> 1.1f) properties &= ~OMIT_TF_POSITIONS;
-    
+    String autoGeneratePhraseQueriesStr = args.remove("autoGeneratePhraseQueries");
+    if (autoGeneratePhraseQueriesStr != null)
+      autoGeneratePhraseQueries = Boolean.parseBoolean(autoGeneratePhraseQueriesStr);
    super.init(schema, args);    
  }

+  public boolean getAutoGeneratePhraseQueries() {
+    return autoGeneratePhraseQueries;
+  }
+
  public SortField getSortField(SchemaField field, boolean reverse) {
    return getStringSort(field, reverse);
  }
--- a/solr/src/java/org/apache/solr/search/SolrQueryParser.java
+++ b/solr/src/java/org/apache/solr/search/SolrQueryParser.java
@ -142,11 +142,15 @@ public class SolrQueryParser extends QueryParser {
        return parser.subQuery(queryText, null).getQuery();
      }
    }
-    //Intercept poly fields, as they get expanded by default to an OR clause of
-    SchemaField sf = schema.getField(field);
-    //TODO: is there anyway to avoid this instance of check?
-    if (sf != null&& !(sf.getType() instanceof TextField)){//we have a poly field, deal with it specially by delegating to the FieldType
-      return sf.getType().getFieldQuery(parser, sf, queryText); 
+    SchemaField sf = schema.getFieldOrNull(field);
+    if (sf != null) {
+      FieldType ft = sf.getType();
+      // delegate to type for everything except TextField
+      if (ft instanceof TextField) {
+        return super.getFieldQuery(field, queryText, quoted || ((TextField)ft).getAutoGeneratePhraseQueries());
+      } else {
+        return sf.getType().getFieldQuery(parser, sf, queryText);
+      }
    }

    // default to a normal field query
--- a/solr/src/test/org/apache/solr/search/TestSolrQueryParser.java
+++ b/solr/src/test/org/apache/solr/search/TestSolrQueryParser.java
@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.After;
+import org.junit.Test;
+
+
+public class TestSolrQueryParser extends SolrTestCaseJ4 {
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig.xml", "schema12.xml");
+    createIndex();
+  }
+
+  public static void createIndex() {
+    String v;
+    v="how now brown cow";
+    assertU(adoc("id","1", "text",v,  "text_np",v));
+    v="now cow";
+    assertU(adoc("id","2", "text",v,  "text_np",v));
+    assertU(commit());
+  }
+
+  @Test
+  public void testPhrase() {
+    // should generate a phrase of "now cow" and match only one doc
+    assertQ(req("q","text:now-cow", "indent","true")
+        ,"//*[@numFound='1']"
+    );
+    // should generate a query of (now OR cow) and match both docs
+    assertQ(req("q","text_np:now-cow", "indent","true")
+        ,"//*[@numFound='2']"
+    );
+  }
+
+}
--- a/solr/src/test/test-files/solr/conf/schema12.xml
+++ b/solr/src/test/test-files/solr/conf/schema12.xml
@ -145,6 +145,35 @@
    </fieldType>


+    <!-- field type that doesn't generate phrases from unquoted multiple tokens per analysis unit -->
+   <fieldType name="text_np" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false" >
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
    <fieldtype name="nametext" class="solr.TextField">
      <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
    </fieldtype>
@ -403,6 +432,8 @@
   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="bday" type="date" indexed="true" stored="true"/>

+   <field name="text_np" type="text_np" indexed="true" stored="false"/>
+
   <field name="title_stemmed" type="text" indexed="true" stored="false"/>
   <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>