SOLR-879: enable position increments in QP, fix example schemas

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@721687 13f79535-47bb-0310-9956-ffa450edef68
2008-11-29 16:38:08 +00:00 · 2008-11-29 16:38:08 +00:00 · b76a2a7b68
parent 497a61a2fb
commit b76a2a7b68
5 changed files with 214 additions and 28 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -135,6 +135,11 @@ Bug Fixes
 11. SOLR-872: Better error message for incorrect copyField destination (Noble Paul via shalin)
 12. SOLR-879: Enable position increments in the query parser and fix the
    example schema to enable position increments for the stop filter in
    both the index and query analyzers to fix the bug with phrase queries
    with stopwords. (yonik)
 Other Changes
 ----------------------
--- a/contrib/javascript/example/testsolr/solr/conf/schema.xml
+++ b/contrib/javascript/example/testsolr/solr/conf/schema.xml
@ -1,4 +1,42 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <!--  
 This is the Solr schema file. This file should be named "schema.xml" and
 should be in the conf directory under the solr home
 (i.e. ./solr/conf/schema.xml by default) 
 or located where the classloader for the Solr webapp can find it.
 This example schema is the recommended starting point for users.
 It should be kept correct and concise, usable out-of-the-box.
 For more information, on how to customize this file, please see
 http://wiki.apache.org/solr/SchemaXml
 -->
 <schema name="example" version="1.1">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.1" is Solr's version number for the schema syntax and semantics.  It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued by nature
       1.1: multiValued attribute introduced, false by default -->
  <types>
    <!-- field type definitions. The "name" attribute is
       just a label to be used by field definitions.  The "class"
@ -127,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
+          add enablePositionIncrements=true in both the index and query
-             allow for accurate phrase queries.
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -143,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -166,7 +208,6 @@
      </analyzer>
    </fieldType>
    <!--
     Setup simple analysis for spell checking
     -->
@ -178,6 +219,16 @@
      </analyzer>
    </fieldType>
    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
    <!--
    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    -->
    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
@ -210,6 +261,14 @@
        />
      </analyzer>
    </fieldType>
    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
      </analyzer>
    </fieldtype> 
    <!-- since fields of this type are by default not stored or indexed, any data added to 
         them will be ignored outright 
@ -220,32 +279,134 @@
 <fields>
   <!-- Valid attributes for fields:
     name: mandatory - the name for the field
     type: mandatory - the name of a previously defined type from the <types> section
     indexed: true if this field should be indexed (searchable or sortable)
     stored: true if this field should be retrievable
     compressed: [false] if this field should be stored using gzip compression
       (this will only apply if the field type is compressable; among
       the standard field types, only TextField and StrField are)
     multiValued: true if this field may contain multiple values per document
     omitNorms: (expert) set to true to omit the norms associated with
       this field (this disables length normalization and index-time
       boosting for the field, and saves some memory).  Only full-text
       fields or fields that need an index-time boost need norms.
     termVectors: [false] set to true to store the term vector for a given field.
       When using MoreLikeThis, fields used for similarity should be stored for 
       best performance.
     termPositions: Store position information with the term vector.  This will increase storage costs.
     termOffsets: Store offset information with the term vector. This will increase storage costs.
   -->
   <field name="id" type="string" indexed="true" stored="true" required="true" /> 
-   <field name="title" type="text" indexed="true" stored="true"/>
+   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
-   <field name="text" type="text" indexed="true" stored="true"/>
+   <field name="name" type="text" indexed="true" stored="true"/>
-   <field name="date" type="date" indexed="true" stored="true"/>
+   <field name="nameSort" type="string" indexed="true" stored="false"/>
-   <field name="dateline" type="text" indexed="true" stored="true"/>
+   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
-   <field name="places" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
-   <field name="countryCodes" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
-   <field name="topics" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
-   <field name="organisations" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="includes" type="text" indexed="true" stored="true"/>
-   <field name="exchanges" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+
-   <field name="companies" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="weight" type="sfloat" indexed="true" stored="true"/>
-   <field name="allText" type="text" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
+   <field name="price"  type="sfloat" indexed="true" stored="true"/>
   <!-- "default" values can be specified for fields, indicating which
        value should be used if no value is specified when adding a document.
     -->
   <field name="popularity" type="sint" indexed="true" stored="true" default="0"/>
   <field name="inStock" type="boolean" indexed="true" stored="true"/>
   <!-- Some sample docs exists solely to demonstrate the spellchecker
        functionality, this is the only field they container.
        Typically you might build the spellchecker of "catchall" type field
        containing all of the text in each document
     -->
   <field name="word" type="string" indexed="true" stored="true"/>
   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema  -->
   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
   <!-- non-tokenized version of manufacturer to make it easier to sort or group
        results by manufacturer.  copied from "manu" via copyField -->
   <field name="manu_exact" type="string" indexed="true" stored="false"/>
   <!-- Here, default is used to create a "timestamp" field indicating
        When each document was indexed.
     -->
   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
   <field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
   <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
        will be used if the name matches any of the patterns.
        RESTRICTION: the glob-like pattern in the name attribute must have
        a "*" only at the start or the end.
        EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
        Longer patterns will be matched first.  if equal size patterns
        both match, the first appearing in the schema will be used.  -->
   <dynamicField name="*_i"  type="sint"    indexed="true"  stored="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
   <dynamicField name="*_l"  type="slong"   indexed="true"  stored="true"/>
   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
   <dynamicField name="*_f"  type="sfloat"  indexed="true"  stored="true"/>
   <dynamicField name="*_d"  type="sdouble" indexed="true"  stored="true"/>
   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
   <dynamicField name="random*" type="random" />
   <!-- uncomment the following to ignore any fields that don't already match an existing 
        field name or dynamic field, rather than reporting them as an error. 
        alternately, change the type="ignored" to some other type e.g. "text" if you want 
        unknown fields indexed and/or stored by default --> 
   <!--dynamicField name="*" type="ignored" /-->
 </fields>
-	<copyField source="title" dest="allText"/>
+ <!-- Field to use to determine and enforce document uniqueness. 
-	<copyField source="text" dest="allText"/>
+      Unless this field is marked with required="false", it will be a required field
-	<copyField source="places" dest="allText"/>
+   -->
 	<copyField source="topics" dest="allText"/>
 	<copyField source="companies" dest="allText"/>
 	<copyField source="exchanges" dest="allText"/>
 <uniqueKey>id</uniqueKey>
 <!-- field for the QueryParser to use when an explicit fieldname is absent -->
 <defaultSearchField>text</defaultSearchField>
 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>
  <!-- copyField commands copy one field to another at the time a document
        is added to the index.  It's used either to index the same field differently,
        or to add multiple fields to the same field for easier/faster searching.  -->
   <copyField source="id" dest="sku"/>
   <copyField source="incubationdate_dt" dest="incubationdate_s"/>
   <copyField source="cat" dest="text"/>
   <copyField source="name" dest="text"/>
   <copyField source="name" dest="nameSort"/>
   <copyField source="name" dest="alphaNameSort"/>
   <copyField source="manu" dest="text"/>
   <copyField source="features" dest="text"/>
   <copyField source="includes" dest="text"/>
   <copyField source="manu" dest="manu_exact"/>
  <copyField source="name" dest="spell"/>
 <!-- Similarity is the scoring routine for each document vs. a query.
      A custom similarity may be specified here, but the default is fine
      for most applications.  -->
 <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
 <!-- ... OR ...
      Specify a SimilarityFactory class name implementation
      allowing parameters to be used.
 -->
 <!--
 <similarity class="com.example.solr.CustomSimilarityFactory">
   <str name="paramkey">param value</str>
 </similarity>
 -->
 </schema>
--- a/contrib/velocity/src/main/solr/conf/schema.xml
+++ b/contrib/velocity/src/main/solr/conf/schema.xml
@ -165,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
+          add enablePositionIncrements=true in both the index and query
-             allow for accurate phrase queries.
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -181,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -215,6 +219,16 @@
      </analyzer>
    </fieldType>
    <!-- charFilter + "CharStream aware" WhitespaceTokenizer  -->
    <!--
    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    -->
    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -165,8 +165,8 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-             enablePositionIncrements=true ensures that a 'gap' is left to
+          add enablePositionIncrements=true in both the index and query
-             allow for accurate phrase queries.
+          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
@ -181,7 +181,11 @@
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
--- a/src/java/org/apache/solr/search/SolrQueryParser.java
+++ b/src/java/org/apache/solr/search/SolrQueryParser.java
@ -73,6 +73,7 @@ public class SolrQueryParser extends QueryParser {
    this.parser  = null;
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
    setEnablePositionIncrements(true);    
  }
  public SolrQueryParser(QParser parser, String defaultField) {
@ -85,6 +86,7 @@ public class SolrQueryParser extends QueryParser {
    this.parser = parser;
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
    setEnablePositionIncrements(true);
  }
  private void checkNullField(String field) throws SolrException {