SOLR-879: enable position increments in QP, fix example schemas

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@721687 13f79535-47bb-0310-9956-ffa450edef68
2008-11-29 16:38:08 +00:00 · 2008-11-29 16:38:08 +00:00 · b76a2a7b68
parent 497a61a2fb
commit b76a2a7b68
5 changed files with 214 additions and 28 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -135,6 +135,11 @@ Bug Fixes

 11. SOLR-872: Better error message for incorrect copyField destination (Noble Paul via shalin)

+12. SOLR-879: Enable position increments in the query parser and fix the
+    example schema to enable position increments for the stop filter in
+    both the index and query analyzers to fix the bug with phrase queries
+    with stopwords. (yonik)
+

 Other Changes
 ----------------------
--- a/contrib/javascript/example/testsolr/solr/conf/schema.xml
+++ b/contrib/javascript/example/testsolr/solr/conf/schema.xml
@ -1,4 +1,42 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--  
+ This is the Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default) 
+ or located where the classloader for the Solr webapp can find it.
+
+ This example schema is the recommended starting point for users.
+ It should be kept correct and concise, usable out-of-the-box.
+
+ For more information on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+-->
+
 <schema name="example" version="1.1">
+  <!-- attribute "name" is the name of this schema and is only used for display purposes.
+       Applications should change this to reflect the nature of the search collection.
+       version="1.1" is Solr's version number for the schema syntax and semantics.  It should
+       not normally be changed by applications.
+       1.0: multiValued attribute did not exist, all fields are multiValued by nature
+       1.1: multiValued attribute introduced, false by default -->
+
  <types>
    <!-- field type definitions. The "name" attribute is
       just a label to be used by field definitions.  The "class"
@ -127,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -143,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -166,7 +208,6 @@
      </analyzer>
    </fieldType>

-
    <!--
     Setup simple analysis for spell checking
     -->
@ -178,6 +219,16 @@
      </analyzer>
    </fieldType>

+    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
+    <!--
+    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
+      <analyzer>
+        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+    -->
+
    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
@ -211,6 +262,14 @@
      </analyzer>
    </fieldType>
    
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype> 
+    
+
    <!-- since fields of this type are by default not stored or indexed, any data added to 
         them will be ignored outright 
     --> 
@ -220,32 +279,134 @@


 <fields>
+   <!-- Valid attributes for fields:
+     name: mandatory - the name for the field
+     type: mandatory - the name of a previously defined type from the <types> section
+     indexed: true if this field should be indexed (searchable or sortable)
+     stored: true if this field should be retrievable
+     compressed: [false] if this field should be stored using gzip compression
+       (this will only apply if the field type is compressible; among
+       the standard field types, only TextField and StrField are)
+     multiValued: true if this field may contain multiple values per document
+     omitNorms: (expert) set to true to omit the norms associated with
+       this field (this disables length normalization and index-time
+       boosting for the field, and saves some memory).  Only full-text
+       fields or fields that need an index-time boost need norms.
+     termVectors: [false] set to true to store the term vector for a given field.
+       When using MoreLikeThis, fields used for similarity should be stored for 
+       best performance.
+     termPositions: Store position information with the term vector.  This will increase storage costs.
+     termOffsets: Store offset information with the term vector. This will increase storage costs.
+   -->
+
   <field name="id" type="string" indexed="true" stored="true" required="true" /> 
-   <field name="title" type="text" indexed="true" stored="true"/>
-   <field name="text" type="text" indexed="true" stored="true"/>
-   <field name="date" type="date" indexed="true" stored="true"/>
-   <field name="dateline" type="text" indexed="true" stored="true"/>
-   <field name="places" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="countryCodes" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="topics" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="organisations" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="exchanges" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="companies" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="allText" type="text" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
+   <field name="name" type="text" indexed="true" stored="true"/>
+   <field name="nameSort" type="string" indexed="true" stored="false"/>
+   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
+   <field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
+   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
+   <field name="includes" type="text" indexed="true" stored="true"/>
+
+   <field name="weight" type="sfloat" indexed="true" stored="true"/>
+   <field name="price"  type="sfloat" indexed="true" stored="true"/>
+   <!-- "default" values can be specified for fields, indicating which
+        value should be used if no value is specified when adding a document.
+     -->
+   <field name="popularity" type="sint" indexed="true" stored="true" default="0"/>
+   <field name="inStock" type="boolean" indexed="true" stored="true"/>
+
+   <!-- Some sample docs exist solely to demonstrate the spellchecker
+        functionality; this is the only field they contain.
+        Typically you might build the spellchecker off a "catchall" type field
+        containing all of the text in each document.
+     -->
+   <field name="word" type="string" indexed="true" stored="true"/>
+
+   
+   <!-- catchall field, containing all other searchable text fields (implemented
+        via copyField further on in this schema)  -->
+   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+
+   <!-- non-tokenized version of manufacturer to make it easier to sort or group
+        results by manufacturer.  copied from "manu" via copyField -->
+   <field name="manu_exact" type="string" indexed="true" stored="false"/>
+
+   <!-- Here, default is used to create a "timestamp" field indicating
+        when each document was indexed.
+     -->
+   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+   
+   <field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
+   <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
+        will be used if the name matches any of the patterns.
+        RESTRICTION: the glob-like pattern in the name attribute must have
+        a "*" only at the start or the end.
+        EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
+        Longer patterns will be matched first.  If equal-size patterns
+        both match, the first appearing in the schema will be used.  -->
+   <dynamicField name="*_i"  type="sint"    indexed="true"  stored="true"/>
+   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
+   <dynamicField name="*_l"  type="slong"   indexed="true"  stored="true"/>
+   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
+   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
+   <dynamicField name="*_f"  type="sfloat"  indexed="true"  stored="true"/>
+   <dynamicField name="*_d"  type="sdouble" indexed="true"  stored="true"/>
+   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
+
+   <dynamicField name="random*" type="random" />
+
+   <!-- Uncomment the following to ignore any fields that don't already match an existing
+        field name or dynamic field, rather than reporting them as an error.
+        Alternatively, change the type="ignored" to some other type, e.g. "text", if you want
+        unknown fields indexed and/or stored by default. -->
+   <!--dynamicField name="*" type="ignored" /-->
+   
 </fields>

-	<copyField source="title" dest="allText"/>
-	<copyField source="text" dest="allText"/>
-	<copyField source="places" dest="allText"/>
-	<copyField source="topics" dest="allText"/>
-	<copyField source="companies" dest="allText"/>
-	<copyField source="exchanges" dest="allText"/>
-
+ <!-- Field to use to determine and enforce document uniqueness. 
+      Unless this field is marked with required="false", it will be a required field
+   -->
 <uniqueKey>id</uniqueKey>

+ <!-- field for the QueryParser to use when an explicit fieldname is absent -->
 <defaultSearchField>text</defaultSearchField>

+ <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>

+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field differently,
+        or to add multiple fields to the same field for easier/faster searching.  -->
+   <copyField source="id" dest="sku"/>
+
+   <copyField source="incubationdate_dt" dest="incubationdate_s"/>
+   <copyField source="cat" dest="text"/>
+   <copyField source="name" dest="text"/>
+   <copyField source="name" dest="nameSort"/>
+   <copyField source="name" dest="alphaNameSort"/>
+   <copyField source="manu" dest="text"/>
+   <copyField source="features" dest="text"/>
+   <copyField source="includes" dest="text"/>
+
+   <copyField source="manu" dest="manu_exact"/>
+
+  <copyField source="name" dest="spell"/>
+
+ <!-- Similarity is the scoring routine for each document vs. a query.
+      A custom similarity may be specified here, but the default is fine
+      for most applications.  -->
+ <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
+ <!-- ... OR ...
+      Specify a SimilarityFactory class name implementation
+      allowing parameters to be used.
+ -->
+ <!--
+ <similarity class="com.example.solr.CustomSimilarityFactory">
+   <str name="paramkey">param value</str>
+ </similarity>
+ -->
+

 </schema>
--- a/contrib/velocity/src/main/solr/conf/schema.xml
+++ b/contrib/velocity/src/main/solr/conf/schema.xml
@ -165,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -181,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -215,6 +219,16 @@
      </analyzer>
    </fieldType>

+    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
+    <!--
+    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
+      <analyzer>
+        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+    -->
+
    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -165,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
-             allow for accurate phrase queries.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -181,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
--- a/src/java/org/apache/solr/search/SolrQueryParser.java
+++ b/src/java/org/apache/solr/search/SolrQueryParser.java
@ -73,6 +73,7 @@ public class SolrQueryParser extends QueryParser {
    this.parser  = null;
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
+    setEnablePositionIncrements(true);    
  }

  public SolrQueryParser(QParser parser, String defaultField) {
@ -85,6 +86,7 @@ public class SolrQueryParser extends QueryParser {
    this.parser = parser;
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
+    setEnablePositionIncrements(true);
  }

  private void checkNullField(String field) throws SolrException {