git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206767 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
Erick Erickson 2011-11-27 17:04:38 +00:00
parent 5c4063bef2
commit c94c1c5a64
18 changed files with 366 additions and 210 deletions

View File

@ -193,6 +193,11 @@ New Features
a complete analysis chain for multiterm queries.
(Pete Sturge, Erick Erickson; mentoring from Seeley and Muir)
* SOLR-2918 Improvement to SOLR-2438, added MultiTermAwareComponent to the various classes
that should transform multiterm queries in various ways, and use this as the criteria for
adding them to the multiterm analyzer that is constructed if not specified in the
<fieldType>
Optimizations
----------------------

View File

@ -32,9 +32,14 @@ import org.apache.lucene.analysis.TokenStream;
* &lt;/fieldType&gt;</pre>
*
*/
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
public ASCIIFoldingFilter create(TokenStream input) {
return new ASCIIFoldingFilter(input);
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -33,7 +33,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
* &lt;/fieldType&gt;</pre>
*
*/
public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
public class LowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
@Override
public void init(Map<String,String> args) {
super.init(args);
@ -43,4 +43,9 @@ public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
public LowerCaseFilter create(TokenStream input) {
return new LowerCaseFilter(luceneMatchVersion,input);
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -17,6 +17,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import java.io.Reader;
@ -32,7 +33,7 @@ import java.util.Map;
* &lt;/fieldType&gt;</pre>
*
*/
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory implements MultiTermAwareComponent {
@Override
public void init(Map<String,String> args) {
super.init(args);
@ -42,4 +43,11 @@ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
public LowerCaseTokenizer create(Reader input) {
return new LowerCaseTokenizer(luceneMatchVersion,input);
}
@Override
public Object getMultiTermComponent() {
LowerCaseFilterFactory filt = new LowerCaseFilterFactory();
filt.init(args);
return filt;
}
}

View File

@ -46,7 +46,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
*
*/
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
ResourceLoaderAware {
ResourceLoaderAware, MultiTermAwareComponent {
protected NormalizeCharMap normMap;
private String mapping;
@ -126,4 +126,9 @@ public class MappingCharFilterFactory extends BaseCharFilterFactory implements
}
return new String( out, 0, writePos );
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -0,0 +1,31 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Add to any analysis factory component to allow returning an
 * analysis component factory for use with partial terms in prefix queries,
 * wildcard queries, range query endpoints, regex queries, etc.
 *
 * @lucene.experimental
 */
public interface MultiTermAwareComponent {
/** Returns an analysis component to handle analysis of multi-term queries.
 * The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
 * Implementations may return {@code this} when the factory is already multi-term safe.
 */
public Object getMultiTermComponent();
}

View File

@ -31,10 +31,15 @@ import org.apache.lucene.analysis.fa.PersianCharFilter;
* &lt;/fieldType&gt;</pre>
*
*/
public class PersianCharFilterFactory extends BaseCharFilterFactory {
public class PersianCharFilterFactory extends BaseCharFilterFactory implements MultiTermAwareComponent {
@Override
public CharStream create(CharStream input) {
return new PersianCharFilter(input);
}
@Override
public Object getMultiTermComponent() {
return this;
}
}

View File

@ -67,3 +67,4 @@ public interface TokenFilterFactory {
/** Transform the specified input TokenStream */
public TokenStream create(TokenStream input);
}

View File

@ -48,15 +48,13 @@ public abstract class FieldProperties {
protected final static int REQUIRED = 0x00001000;
protected final static int OMIT_POSITIONS = 0x00002000;
protected final static int LEGACY_MULTITERM = 0x00004000;
static final String[] propertyNames = {
"indexed", "tokenized", "stored",
"binary", "omitNorms", "omitTermFreqAndPositions",
"termVectors", "termPositions", "termOffsets",
"multiValued",
"sortMissingFirst","sortMissingLast","required", "omitPositions" ,
"legacyMultiTerm"
"sortMissingFirst","sortMissingLast","required", "omitPositions"
};
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();

View File

@ -428,21 +428,6 @@ public abstract class FieldType extends FieldProperties {
*/
protected Analyzer queryAnalyzer=analyzer;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type, subclasses can set analyzer themselves or override
* getAnalyzer()
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
* lowercasing filters, and charfilters.
*
* If users require old-style behavior, they can specify 'legacyMultiterm="true" ' in the schema file
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
/**
* Returns the Analyzer to be used when indexing fields of this type.
* <p>
@ -465,20 +450,6 @@ public abstract class FieldType extends FieldProperties {
return queryAnalyzer;
}
/**
* Returns the Analyzer to be used when searching fields of this type when mult-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
private final String analyzerError =
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer";
/**
* Sets the Analyzer to be used when indexing fields of this type.
@ -524,28 +495,6 @@ public abstract class FieldType extends FieldProperties {
throw e;
}
/**
* Sets the Analyzer to be used when querying fields of this type.
*
* <p>
*
* Subclasses that override this method need to ensure the behavior
* of the analyzer is consistent with the implementation of toInternal.
* </p>
*
* @see #toInternal
* @see #setAnalyzer
* @see #getQueryAnalyzer
*/
public void setMultiTermAnalyzer(Analyzer analyzer) {
SolrException e = new SolrException
(ErrorCode.SERVER_ERROR,
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer");
SolrException.logOnce(log,null,e);
throw e;
}
/** @lucene.internal */
protected Similarity similarity;

View File

@ -102,15 +102,13 @@ public final class FieldTypePluginLoader
if (queryAnalyzer==null) queryAnalyzer=analyzer;
if (analyzer==null) analyzer=queryAnalyzer;
if (multiAnalyzer == null) {
Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36);
legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch :
Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer);
}
if (analyzer!=null) {
ft.setAnalyzer(analyzer);
ft.setQueryAnalyzer(queryAnalyzer);
ft.setMultiTermAnalyzer(multiAnalyzer);
if (ft instanceof TextField)
((TextField)ft).setMultiTermAnalyzer(multiAnalyzer);
}
if (similarity!=null) {
ft.setSimilarity(similarity);
@ -143,36 +141,75 @@ public final class FieldTypePluginLoader
// 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior.
// Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) {
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
if (queryAnalyzer == null) return null;
if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) {
if (!(queryAnalyzer instanceof TokenizerChain)) {
return new KeywordAnalyzer();
}
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
MultiTermChainBuilder builder = new MultiTermChainBuilder();
// we know it'll never be longer than this unless the code below is explicitly changed
TokenFilterFactory[] filters = new TokenFilterFactory[2];
int idx = 0;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
if (factory instanceof LowerCaseFilterFactory) {
filters[idx] = new LowerCaseFilterFactory();
filters[idx++].init(factory.getArgs());
}
if (factory instanceof ASCIIFoldingFilterFactory) {
filters[idx] = new ASCIIFoldingFilterFactory();
filters[idx++].init(factory.getArgs());
CharFilterFactory[] charFactories = tc.getCharFilterFactories();
if (charFactories != null) {
for (CharFilterFactory fact : charFactories) {
builder.add(fact);
}
}
WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
white.init(tc.getTokenizerFactory().getArgs());
return new TokenizerChain(tc.getCharFilterFactories(),
white,
Arrays.copyOfRange(filters, 0, idx));
builder.add(tc.getTokenizerFactory());
for (TokenFilterFactory fact : tc.getTokenFilterFactories()) {
builder.add(fact);
}
return builder.build();
}
private static class MultiTermChainBuilder {
static final KeywordTokenizerFactory keyFactory;
static {
keyFactory = new KeywordTokenizerFactory();
keyFactory.init(new HashMap<String,String>());
}
ArrayList<CharFilterFactory> charFilters = null;
ArrayList<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>(2);
TokenizerFactory tokenizer = keyFactory;
public void add(Object current) {
if (!(current instanceof MultiTermAwareComponent)) return;
Object newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent();
if (newComponent instanceof TokenFilterFactory) {
if (filters == null) {
filters = new ArrayList<TokenFilterFactory>(2);
}
filters.add((TokenFilterFactory)newComponent);
} else if (newComponent instanceof TokenizerFactory) {
tokenizer = (TokenizerFactory)newComponent;
} else if (newComponent instanceof CharFilterFactory) {
if (charFilters == null) {
charFilters = new ArrayList<CharFilterFactory>(1);
}
charFilters.add( (CharFilterFactory)newComponent);
} else {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
}
}
public TokenizerChain build() {
CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
}
//
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
//

View File

@ -97,10 +97,6 @@ public final class SchemaField extends FieldProperties {
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
boolean isBinary() { return (properties & BINARY)!=0; }
boolean legacyMultiTerm() {
return (properties & LEGACY_MULTITERM) != 0;
}
public IndexableField createField(Object val, float boost) {
return type.createField(this,val,boost);
}

View File

@ -17,13 +17,8 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.*;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -32,6 +27,7 @@ import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@ -48,6 +44,19 @@ import java.io.StringReader;
public class TextField extends FieldType {
protected boolean autoGeneratePhraseQueries;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type, subclasses can set analyzer themselves or override
* getAnalyzer()
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
* lowercasing filters, and charfilters.
*
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
@Override
protected void init(IndexSchema schema, Map<String,String> args) {
properties |= TOKENIZED;
@ -63,6 +72,21 @@ public class TextField extends FieldType {
super.init(schema, args);
}
/**
* Returns the Analyzer to be used when searching fields of this type when multi-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
public void setMultiTermAnalyzer(Analyzer analyzer) {
this.multiTermAnalyzer = analyzer;
}
public boolean getAutoGeneratePhraseQueries() {
return autoGeneratePhraseQueries;
}
@ -98,11 +122,50 @@ public class TextField extends FieldType {
this.queryAnalyzer = analyzer;
}
@Override
public void setMultiTermAnalyzer(Analyzer analyzer) {
this.multiTermAnalyzer = analyzer;
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
Analyzer multiAnalyzer = getMultiTermAnalyzer();
BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer);
BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer);
return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
}
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
if (part == null) return null;
TokenStream source;
try {
source = analyzerIn.tokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
try {
if (!source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
termAtt.fillBytesRef();
if (source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
}
try {
source.end();
source.close();
} catch (IOException e) {
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
}
return BytesRef.deepCopyOf(bytes);
}
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
int phraseSlop = 0;
boolean enablePositionIncrements = true;

View File

@ -58,8 +58,9 @@ public class SolrQueryParser extends QueryParser {
protected final IndexSchema schema;
protected final QParser parser;
protected final String defaultField;
protected final Map<String, ReversedWildcardFilterFactory> leadingWildcards =
new HashMap<String, ReversedWildcardFilterFactory>();
// implementation detail - caching ReversedWildcardFilterFactory based on type
private Map<FieldType, ReversedWildcardFilterFactory> leadingWildcards;
public SolrQueryParser(QParser parser, String defaultField) {
this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer());
@ -71,30 +72,34 @@ public class SolrQueryParser extends QueryParser {
this.parser = parser;
this.defaultField = defaultField;
setEnablePositionIncrements(true);
checkAllowLeadingWildcards();
setLowercaseExpandedTerms(false);
setAllowLeadingWildcard(true);
}
protected void checkAllowLeadingWildcards() {
boolean allow = false;
for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) {
Analyzer a = e.getValue().getAnalyzer();
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
if (leadingWildcards == null) leadingWildcards = new HashMap<FieldType, ReversedWildcardFilterFactory>();
ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType);
if (fac == null && leadingWildcards.containsKey(fac)) {
return fac;
}
Analyzer a = fieldType.getAnalyzer();
if (a instanceof TokenizerChain) {
// examine the indexing analysis chain if it supports leading wildcards
TokenizerChain tc = (TokenizerChain)a;
TokenFilterFactory[] factories = tc.getTokenFilterFactories();
for (TokenFilterFactory factory : factories) {
if (factory instanceof ReversedWildcardFilterFactory) {
allow = true;
leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory);
fac = (ReversedWildcardFilterFactory)factory;
break;
}
}
}
leadingWildcards.put(fieldType, fac);
return fac;
}
// XXX should be enabled on a per-field basis
if (allow) {
setAllowLeadingWildcard(true);
}
}
private void checkNullField(String field) throws SolrException {
if (field == null && defaultField == null) {
@ -104,12 +109,14 @@ public class SolrQueryParser extends QueryParser {
}
}
protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) {
protected String analyzeIfMultitermTermText(String field, String part, FieldType fieldType) {
if (part == null) return part;
SchemaField sf = schema.getFieldOrNull((field));
if (sf == null || ! (sf.getType() instanceof TextField)) return part;
return analyzeMultitermTerm(field, part, analyzer).utf8ToString();
if (sf == null || ! (fieldType instanceof TextField)) return part;
String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString();
// System.out.println("INPUT="+part + " OUTPUT="+out);
return out;
}
@Override
@ -143,8 +150,6 @@ public class SolrQueryParser extends QueryParser {
@Override
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
checkNullField(field);
part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
SchemaField sf = schema.getField(field);
return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
}
@ -153,21 +158,10 @@ public class SolrQueryParser extends QueryParser {
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
checkNullField(field);
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
// TODO: toInternal() won't necessarily work on partial
// values, so it looks like we need a getPrefix() function
// on fieldtype? Or at the minimum, a method on fieldType
// that can tell me if I should lowercase or not...
// Schema could tell if lowercase filter is in the chain,
// but a more sure way would be to run something through
// the first time and check if it got lowercased.
// TODO: throw exception if field type doesn't support prefixes?
// (sortable numeric types don't do prefixes, but can do range queries)
Term t = new Term(field, termStr);
PrefixQuery prefixQuery = new PrefixQuery(t);
return prefixQuery;
// Solr has always used constant scoring for prefix queries. This should return constant scoring by default.
return newPrefixQuery(new Term(field, termStr));
}
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
@ -175,10 +169,10 @@ public class SolrQueryParser extends QueryParser {
if ("*".equals(field) && "*".equals(termStr)) {
return newMatchAllDocsQuery();
}
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
FieldType fieldType = schema.getFieldType(field);
termStr = analyzeIfMultitermTermText(field, termStr, fieldType);
// can we use reversed wildcards in this field?
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
if (factory != null) {
Term term = new Term(field, termStr);
// fsa representing the query
@ -211,19 +205,15 @@ public class SolrQueryParser extends QueryParser {
}
};
}
Query q = super.getWildcardQuery(field, termStr);
if (q instanceof WildcardQuery) {
// use a constant score query to avoid overflowing clauses
WildcardQuery wildcardQuery = new WildcardQuery(((WildcardQuery)q).getTerm());
return wildcardQuery;
}
return q;
// Solr has always used constant scoring for wildcard queries. This should return constant scoring by default.
return newWildcardQuery(new Term(field, termStr));
}
@Override
protected Query getRegexpQuery(String field, String termStr) throws ParseException
{
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
return super.getRegexpQuery(field, termStr);
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
return newRegexpQuery(new Term(field, termStr));
}
}

View File

@ -64,7 +64,7 @@
</analyzer>
</fieldType>
<fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false">
<fieldType name="text_rev" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
@ -80,12 +80,25 @@
</analyzer>
</fieldType>
<fieldType name="text_lower_tokenizer" class="solr.TextField">
<fieldType name="text_lower_token" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_oldstyle" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
<analyzer type="multiterm">
<tokenizer class="solr.KeywordTokenizerFactory" />
</analyzer>
</fieldType>
<fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -99,19 +112,47 @@
</analyzer>
</fieldType>
<fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true">
<fieldType name="text_straight" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_lower" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_folding" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_stemming" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="text_keyword" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
@ -133,10 +174,17 @@
<field name="content_ws" type="text_ws" indexed="true" stored="true"/>
<field name="content_rev" type="text_rev" indexed="true" stored="true"/>
<field name="content_multi" type="text_multi" indexed="true" stored="true"/>
<field name="content_lower_token" type="text_multi" indexed="true" stored="true"/>
<field name="content_lower_token" type="text_lower_token" indexed="true" stored="true"/>
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>
<dynamicField name="*_folding" type="text_folding" indexed="true" stored="true"/>
<dynamicField name="*_stemming" type="text_stemming" indexed="true" stored="true"/>
<dynamicField name="*_keyword" type="text_keyword" indexed="true" stored="true"/>
</fields>
<defaultSearchField>content</defaultSearchField>

View File

@ -36,7 +36,7 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test
public void testMultiFound() {
SchemaField field = h.getCore().getSchema().getField("content_multi");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
@ -58,9 +58,9 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test
public void testQueryCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_charfilter");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue(factory instanceof LowerCaseFilterFactory);
@ -73,15 +73,15 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test
public void testDefaultCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_ws");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
}
assertTrue(tc.getCharFilterFactories().length == 0);
assertTrue(tc.getCharFilterFactories() == null);
}
}

View File

@ -59,7 +59,12 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
"content_lower_token", docs[i],
"content_oldstyle", docs[i],
"content_charfilter", docs[i],
"content_multi_bad", docs[i]
"content_multi_bad", docs[i],
"content_straight", docs[i],
"content_lower", docs[i],
"content_folding", docs[i],
"content_stemming", docs[i],
"content_keyword", docs[i]
));
}
assertU(optimize());
@ -95,6 +100,8 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
assertQ(req("q", "content_lower_token:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_oldstyle:" + me),
"//result[@numFound='0']");
}
}
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
@ -128,13 +135,50 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
assertQ(req("q", "content_multi:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_lower_token:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_oldstyle:" + me),
"//result[@numFound='0']");
}
}
}
@Test
public void testLowerTokenizer() {
// The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
}
@Test
public void testRegex() throws Exception {
assertQ(req("q", "content:/Zill[a-z]/"),
"//result[@numFound='1']");
assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased?
"//result[@numFound='1']");
assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
"//result[@numFound='1']");
assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match
"//result[@numFound='0']");
assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match
"//result[@numFound='0']");
assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
"//result[@numFound='1']");
}
@Test
public void testGeneral() throws Exception {
  // Stemming must NOT be applied to the wildcard term itself:
  // "fings*" stays "fings*" (no hit) rather than being stemmed to "fing*" (which would hit).
  assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']");
  assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
}
// Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
// and update the documentation
@Test
@ -143,17 +187,14 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
"//result[@numFound='0']");
}
// Make sure the legacy behavior flag is honored
@Test
public void testLegacyBehavior() {
  // A field type declared with legacyMultiTerm="true" gets no multiterm analysis,
  // so an upper-case prefix query must not match the lower-cased indexed terms.
  String legacyQuery = "content_oldstyle:ABCD*";
  assertQ(req("q", legacyQuery), "//result[@numFound='0']");
}
@Test
public void testWildcardRange() {
  // All three documents fall inside each of these wildcard range expressions.
  String[] rangeQueries = {
      "content:[* TO *]",
      "content:[AB* TO Z*]",
      "content:[AB*E?G* TO TU*W]"
  };
  for (String rangeQuery : rangeQueries) {
    assertQ(req("q", rangeQuery), "//result[@numFound='3']");
  }
}
@ -222,10 +263,13 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
// A multiterm analyzer that expands one token into several must be rejected:
// the query should fail, and the failure should surface as a SolrException cause.
// (The original text carried two mutually exclusive assertions — IllegalArgumentException
// AND SolrException on the same cause — a stale diff remnant; only the SolrException
// check, the behavior this change introduces, is kept.)
@Test
public void testMultiBad() {
  try {
    ignoreException("analyzer returned too many terms");
    assertQ(req("q", "content_multi_bad:" + "abCD*"));
    fail("Should throw exception when token evaluates to more than one term");
  } catch (Exception expected) {
    assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
  } finally {
    // Always restore exception logging, even if the assertions above fail.
    resetExceptionIgnores();
  }
}
}

View File

@ -427,41 +427,6 @@
</analyzer>
</fieldType>
<!-- Illustrates the new "multiterm" analyzer definition the <fieldType> can take a new
parameter legacyMultiTerm="true" if the old behavior is desired. The new default
behavior as of 3.6+ is to automatically define a multiterm analyzer
-->
<fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
applied to wildcard terms (prefix, wildcard range) if specified. This allows, among other
things, not having to lowercase wildcard terms on the client.
In the absence of this section, the new default behavior (3.6, 4.0) is to construct
one of these from the query analyzer that incorporates any defined charfilters, a
WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
(if defined).
Arguably, this is an expert-level analyzer, most cases will be handled by an instance
of this being automatically constructed from the query analyzer.
-->
<analyzer type="multiterm">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
@ -587,6 +552,7 @@
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>