mirror of https://github.com/apache/lucene.git
SOLR-879: enable position increments in QP, fix example schemas
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@721687 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
497a61a2fb
commit
b76a2a7b68
|
@ -135,6 +135,11 @@ Bug Fixes
|
|||
|
||||
11. SOLR-872: Better error message for incorrect copyField destination (Noble Paul via shalin)
|
||||
|
||||
12. SOLR-879: Enable position increments in the query parser and fix the
|
||||
example schema to enable position increments for the stop filter in
|
||||
both the index and query analyzers to fix the bug with phrase queries
|
||||
with stopwords. (yonik)
|
||||
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -1,4 +1,42 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
This is the Solr schema file. This file should be named "schema.xml" and
|
||||
should be in the conf directory under the solr home
|
||||
(i.e. ./solr/conf/schema.xml by default)
|
||||
or located where the classloader for the Solr webapp can find it.
|
||||
|
||||
This example schema is the recommended starting point for users.
|
||||
It should be kept correct and concise, usable out-of-the-box.
|
||||
|
||||
For more information on how to customize this file, please see
|
||||
http://wiki.apache.org/solr/SchemaXml
|
||||
-->
|
||||
|
||||
<schema name="example" version="1.1">
|
||||
<!-- attribute "name" is the name of this schema and is only used for display purposes.
|
||||
Applications should change this to reflect the nature of the search collection.
|
||||
version="1.1" is Solr's version number for the schema syntax and semantics. It should
|
||||
not normally be changed by applications.
|
||||
1.0: multiValued attribute did not exist, all fields are multiValued by nature
|
||||
1.1: multiValued attribute introduced, false by default -->
|
||||
|
||||
<types>
|
||||
<!-- field type definitions. The "name" attribute is
|
||||
just a label to be used by field definitions. The "class"
|
||||
|
@ -127,8 +165,8 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
enablePositionIncrements=true ensures that a 'gap' is left to
|
||||
allow for accurate phrase queries.
|
||||
add enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
|
@ -143,7 +181,11 @@
|
|||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
|
@ -166,7 +208,6 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
Setup simple analysis for spell checking
|
||||
-->
|
||||
|
@ -178,6 +219,16 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
||||
<analyzer>
|
||||
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
||||
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
with various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
|
@ -211,6 +262,14 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed, any data added to
|
||||
them will be ignored outright
|
||||
-->
|
||||
|
@ -220,32 +279,134 @@
|
|||
|
||||
|
||||
<fields>
|
||||
<!-- Valid attributes for fields:
|
||||
name: mandatory - the name for the field
|
||||
type: mandatory - the name of a previously defined type from the <types> section
|
||||
indexed: true if this field should be indexed (searchable or sortable)
|
||||
stored: true if this field should be retrievable
|
||||
compressed: [false] if this field should be stored using gzip compression
|
||||
(this will only apply if the field type is compressible; among
|
||||
the standard field types, only TextField and StrField are)
|
||||
multiValued: true if this field may contain multiple values per document
|
||||
omitNorms: (expert) set to true to omit the norms associated with
|
||||
this field (this disables length normalization and index-time
|
||||
boosting for the field, and saves some memory). Only full-text
|
||||
fields or fields that need an index-time boost need norms.
|
||||
termVectors: [false] set to true to store the term vector for a given field.
|
||||
When using MoreLikeThis, fields used for similarity should be stored for
|
||||
best performance.
|
||||
termPositions: Store position information with the term vector. This will increase storage costs.
|
||||
termOffsets: Store offset information with the term vector. This will increase storage costs.
|
||||
-->
|
||||
|
||||
<field name="id" type="string" indexed="true" stored="true" required="true" />
|
||||
<field name="title" type="text" indexed="true" stored="true"/>
|
||||
<field name="text" type="text" indexed="true" stored="true"/>
|
||||
<field name="date" type="date" indexed="true" stored="true"/>
|
||||
<field name="dateline" type="text" indexed="true" stored="true"/>
|
||||
<field name="places" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="countryCodes" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="topics" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="organisations" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="exchanges" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="companies" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="allText" type="text" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
|
||||
<field name="name" type="text" indexed="true" stored="true"/>
|
||||
<field name="nameSort" type="string" indexed="true" stored="false"/>
|
||||
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
|
||||
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
|
||||
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||
<field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
|
||||
<field name="includes" type="text" indexed="true" stored="true"/>
|
||||
|
||||
<field name="weight" type="sfloat" indexed="true" stored="true"/>
|
||||
<field name="price" type="sfloat" indexed="true" stored="true"/>
|
||||
<!-- "default" values can be specified for fields, indicating which
|
||||
value should be used if no value is specified when adding a document.
|
||||
-->
|
||||
<field name="popularity" type="sint" indexed="true" stored="true" default="0"/>
|
||||
<field name="inStock" type="boolean" indexed="true" stored="true"/>
|
||||
|
||||
<!-- Some sample docs exist solely to demonstrate the spellchecker
|
||||
functionality, this is the only field they contain.
|
||||
Typically you might build the spellchecker from a "catchall" type field
|
||||
containing all of the text in each document
|
||||
-->
|
||||
<field name="word" type="string" indexed="true" stored="true"/>
|
||||
|
||||
|
||||
<!-- catchall field, containing all other searchable text fields (implemented
|
||||
via copyField further on in this schema) -->
|
||||
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||
|
||||
<!-- non-tokenized version of manufacturer to make it easier to sort or group
|
||||
results by manufacturer. copied from "manu" via copyField -->
|
||||
<field name="manu_exact" type="string" indexed="true" stored="false"/>
|
||||
|
||||
<!-- Here, default is used to create a "timestamp" field indicating
|
||||
when each document was indexed.
|
||||
-->
|
||||
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
|
||||
|
||||
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
|
||||
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
|
||||
will be used if the name matches any of the patterns.
|
||||
RESTRICTION: the glob-like pattern in the name attribute must have
|
||||
a "*" only at the start or the end.
|
||||
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
|
||||
Longer patterns will be matched first. If equal-size patterns
|
||||
both match, the first appearing in the schema will be used. -->
|
||||
<dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_f" type="sfloat" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
|
||||
|
||||
<dynamicField name="random*" type="random" />
|
||||
|
||||
<!-- uncomment the following to ignore any fields that don't already match an existing
|
||||
field name or dynamic field, rather than reporting them as an error.
|
||||
alternately, change the type="ignored" to some other type e.g. "text" if you want
|
||||
unknown fields indexed and/or stored by default -->
|
||||
<!--dynamicField name="*" type="ignored" /-->
|
||||
|
||||
</fields>
|
||||
|
||||
<copyField source="title" dest="allText"/>
|
||||
<copyField source="text" dest="allText"/>
|
||||
<copyField source="places" dest="allText"/>
|
||||
<copyField source="topics" dest="allText"/>
|
||||
<copyField source="companies" dest="allText"/>
|
||||
<copyField source="exchanges" dest="allText"/>
|
||||
|
||||
<!-- Field to use to determine and enforce document uniqueness.
|
||||
Unless this field is marked with required="false", it will be a required field
|
||||
-->
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
|
||||
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
|
||||
<solrQueryParser defaultOperator="OR"/>
|
||||
|
||||
<!-- copyField commands copy one field to another at the time a document
|
||||
is added to the index. It's used either to index the same field differently,
|
||||
or to add multiple fields to the same field for easier/faster searching. -->
|
||||
<copyField source="id" dest="sku"/>
|
||||
|
||||
<copyField source="incubationdate_dt" dest="incubationdate_s"/>
|
||||
<copyField source="cat" dest="text"/>
|
||||
<copyField source="name" dest="text"/>
|
||||
<copyField source="name" dest="nameSort"/>
|
||||
<copyField source="name" dest="alphaNameSort"/>
|
||||
<copyField source="manu" dest="text"/>
|
||||
<copyField source="features" dest="text"/>
|
||||
<copyField source="includes" dest="text"/>
|
||||
|
||||
<copyField source="manu" dest="manu_exact"/>
|
||||
|
||||
<copyField source="name" dest="spell"/>
|
||||
|
||||
<!-- Similarity is the scoring routine for each document vs. a query.
|
||||
A custom similarity may be specified here, but the default is fine
|
||||
for most applications. -->
|
||||
<!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
|
||||
<!-- ... OR ...
|
||||
Specify a SimilarityFactory class name implementation
|
||||
allowing parameters to be used.
|
||||
-->
|
||||
<!--
|
||||
<similarity class="com.example.solr.CustomSimilarityFactory">
|
||||
<str name="paramkey">param value</str>
|
||||
</similarity>
|
||||
-->
|
||||
|
||||
|
||||
</schema>
|
||||
|
|
|
@ -165,8 +165,8 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
enablePositionIncrements=true ensures that a 'gap' is left to
|
||||
allow for accurate phrase queries.
|
||||
add enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
|
@ -181,7 +181,11 @@
|
|||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
|
@ -215,6 +219,16 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
||||
<analyzer>
|
||||
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
||||
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
with various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
|
|
|
@ -165,8 +165,8 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
enablePositionIncrements=true ensures that a 'gap' is left to
|
||||
allow for accurate phrase queries.
|
||||
add enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
|
@ -181,7 +181,11 @@
|
|||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
|
|
|
@ -73,6 +73,7 @@ public class SolrQueryParser extends QueryParser {
|
|||
this.parser = null;
|
||||
this.defaultField = defaultField;
|
||||
setLowercaseExpandedTerms(false);
|
||||
setEnablePositionIncrements(true);
|
||||
}
|
||||
|
||||
public SolrQueryParser(QParser parser, String defaultField) {
|
||||
|
@ -85,6 +86,7 @@ public class SolrQueryParser extends QueryParser {
|
|||
this.parser = parser;
|
||||
this.defaultField = defaultField;
|
||||
setLowercaseExpandedTerms(false);
|
||||
setEnablePositionIncrements(true);
|
||||
}
|
||||
|
||||
private void checkNullField(String field) throws SolrException {
|
||||
|
|
Loading…
Reference in New Issue