SOLR-879: enable position increments in QP, fix example schemas

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@721687 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-11-29 16:38:08 +00:00
parent 497a61a2fb
commit b76a2a7b68
5 changed files with 214 additions and 28 deletions

View File

@ -135,6 +135,11 @@ Bug Fixes
11. SOLR-872: Better error message for incorrect copyField destination (Noble Paul via shalin) 11. SOLR-872: Better error message for incorrect copyField destination (Noble Paul via shalin)
12. SOLR-879: Enable position increments in the query parser and fix the
example schema to enable position increments for the stop filter in
both the index and query analyzers to fix the bug with phrase queries
with stopwords. (yonik)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -1,4 +1,42 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This is the Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information, on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml
-->
<schema name="example" version="1.1"> <schema name="example" version="1.1">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.1" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default -->
<types> <types>
<!-- field type definitions. The "name" attribute is <!-- field type definitions. The "name" attribute is
just a label to be used by field definitions. The "class" just a label to be used by field definitions. The "class"
@ -127,8 +165,8 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
--> -->
<!-- Case insensitive stop word removal. <!-- Case insensitive stop word removal.
enablePositionIncrements=true ensures that a 'gap' is left to add enablePositionIncrements=true in both the index and query
allow for accurate phrase queries. analyzers to leave a 'gap' for more accurate phrase queries.
--> -->
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory"
ignoreCase="true" ignoreCase="true"
@ -143,7 +181,11 @@
<analyzer type="query"> <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -166,7 +208,6 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<!-- <!--
Setup simple analysis for spell checking Setup simple analysis for spell checking
--> -->
@ -178,6 +219,16 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
<!--
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
-->
<!-- This is an example of using the KeywordTokenizer along <!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text that does not include some properties of the source text
@ -210,6 +261,14 @@
/> />
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
</analyzer>
</fieldtype>
<!-- since fields of this type are by default not stored or indexed, any data added to <!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright them will be ignored outright
@ -220,32 +279,134 @@
<fields> <fields>
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a previously defined type from the <types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
compressed: [false] if this field should be stored using gzip compression
(this will only apply if the field type is compressable; among
the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
termVectors: [false] set to true to store the term vector for a given field.
When using MoreLikeThis, fields used for similarity should be stored for
best performance.
termPositions: Store position information with the term vector. This will increase storage costs.
termOffsets: Store offset information with the term vector. This will increase storage costs.
-->
<field name="id" type="string" indexed="true" stored="true" required="true" /> <field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="title" type="text" indexed="true" stored="true"/> <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
<field name="text" type="text" indexed="true" stored="true"/> <field name="name" type="text" indexed="true" stored="true"/>
<field name="date" type="date" indexed="true" stored="true"/> <field name="nameSort" type="string" indexed="true" stored="false"/>
<field name="dateline" type="text" indexed="true" stored="true"/> <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
<field name="places" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
<field name="countryCodes" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
<field name="topics" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
<field name="organisations" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="includes" type="text" indexed="true" stored="true"/>
<field name="exchanges" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
<field name="companies" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="weight" type="sfloat" indexed="true" stored="true"/>
<field name="allText" type="text" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" /> <field name="price" type="sfloat" indexed="true" stored="true"/>
<!-- "default" values can be specified for fields, indicating which
value should be used if no value is specified when adding a document.
-->
<field name="popularity" type="sint" indexed="true" stored="true" default="0"/>
<field name="inStock" type="boolean" indexed="true" stored="true"/>
<!-- Some sample docs exists solely to demonstrate the spellchecker
functionality, this is the only field they container.
Typically you might build the spellchecker of "catchall" type field
containing all of the text in each document
-->
<field name="word" type="string" indexed="true" stored="true"/>
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- non-tokenized version of manufacturer to make it easier to sort or group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>
<!-- Here, default is used to create a "timestamp" field indicating
When each document was indexed.
-->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. if equal size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="sfloat" indexed="true" stored="true"/>
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="random*" type="random" />
<!-- uncomment the following to ignore any fields that don't already match an existing
field name or dynamic field, rather than reporting them as an error.
alternately, change the type="ignored" to some other type e.g. "text" if you want
unknown fields indexed and/or stored by default -->
<!--dynamicField name="*" type="ignored" /-->
</fields> </fields>
<copyField source="title" dest="allText"/> <!-- Field to use to determine and enforce document uniqueness.
<copyField source="text" dest="allText"/> Unless this field is marked with required="false", it will be a required field
<copyField source="places" dest="allText"/> -->
<copyField source="topics" dest="allText"/>
<copyField source="companies" dest="allText"/>
<copyField source="exchanges" dest="allText"/>
<uniqueKey>id</uniqueKey> <uniqueKey>id</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>text</defaultSearchField> <defaultSearchField>text</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR"/> <solrQueryParser defaultOperator="OR"/>
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<copyField source="id" dest="sku"/>
<copyField source="incubationdate_dt" dest="incubationdate_s"/>
<copyField source="cat" dest="text"/>
<copyField source="name" dest="text"/>
<copyField source="name" dest="nameSort"/>
<copyField source="name" dest="alphaNameSort"/>
<copyField source="manu" dest="text"/>
<copyField source="features" dest="text"/>
<copyField source="includes" dest="text"/>
<copyField source="manu" dest="manu_exact"/>
<copyField source="name" dest="spell"/>
<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity may be specified here, but the default is fine
for most applications. -->
<!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
<!-- ... OR ...
Specify a SimilarityFactory class name implementation
allowing parameters to be used.
-->
<!--
<similarity class="com.example.solr.CustomSimilarityFactory">
<str name="paramkey">param value</str>
</similarity>
-->
</schema> </schema>

View File

@ -165,8 +165,8 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
--> -->
<!-- Case insensitive stop word removal. <!-- Case insensitive stop word removal.
enablePositionIncrements=true ensures that a 'gap' is left to add enablePositionIncrements=true in both the index and query
allow for accurate phrase queries. analyzers to leave a 'gap' for more accurate phrase queries.
--> -->
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory"
ignoreCase="true" ignoreCase="true"
@ -181,7 +181,11 @@
<analyzer type="query"> <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
@ -215,6 +219,16 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
<!--
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
-->
<!-- This is an example of using the KeywordTokenizer along <!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text that does not include some properties of the source text

View File

@ -165,8 +165,8 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
--> -->
<!-- Case insensitive stop word removal. <!-- Case insensitive stop word removal.
enablePositionIncrements=true ensures that a 'gap' is left to add enablePositionIncrements=true in both the index and query
allow for accurate phrase queries. analyzers to leave a 'gap' for more accurate phrase queries.
--> -->
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory"
ignoreCase="true" ignoreCase="true"
@ -181,7 +181,11 @@
<analyzer type="query"> <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>

View File

@ -73,6 +73,7 @@ public class SolrQueryParser extends QueryParser {
this.parser = null; this.parser = null;
this.defaultField = defaultField; this.defaultField = defaultField;
setLowercaseExpandedTerms(false); setLowercaseExpandedTerms(false);
setEnablePositionIncrements(true);
} }
public SolrQueryParser(QParser parser, String defaultField) { public SolrQueryParser(QParser parser, String defaultField) {
@ -85,6 +86,7 @@ public class SolrQueryParser extends QueryParser {
this.parser = parser; this.parser = parser;
this.defaultField = defaultField; this.defaultField = defaultField;
setLowercaseExpandedTerms(false); setLowercaseExpandedTerms(false);
setEnablePositionIncrements(true);
} }
private void checkNullField(String field) throws SolrException { private void checkNullField(String field) throws SolrException {