git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206767 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
Erick Erickson 2011-11-27 17:04:38 +00:00
parent 5c4063bef2
commit c94c1c5a64
18 changed files with 366 additions and 210 deletions

View File

@ -193,6 +193,11 @@ New Features
a complete analysis chain for multiterm queries. a complete analysis chain for multiterm queries.
(Pete Sturge Erick Erickson, Mentoring from Seeley and Muir) (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
* SOLR-2918 Improvement to SOLR-2438, added MultiTermAwareComponent to the various classes
that should transform multiterm queries in various ways, and use this as the criteria for
adding them to the multiterm analyzer that is constructed if not specified in the
<fieldType>
Optimizations Optimizations
---------------------- ----------------------

View File

@ -32,9 +32,14 @@ import org.apache.lucene.analysis.TokenStream;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
*/ */
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory { public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
public ASCIIFoldingFilter create(TokenStream input) { public ASCIIFoldingFilter create(TokenStream input) {
return new ASCIIFoldingFilter(input); return new ASCIIFoldingFilter(input);
} }
@Override
public Object getMultiTermComponent() {
return this;
}
} }

View File

@ -33,7 +33,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
*/ */
public class LowerCaseFilterFactory extends BaseTokenFilterFactory { public class LowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
@Override @Override
public void init(Map<String,String> args) { public void init(Map<String,String> args) {
super.init(args); super.init(args);
@ -43,4 +43,9 @@ public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
public LowerCaseFilter create(TokenStream input) { public LowerCaseFilter create(TokenStream input) {
return new LowerCaseFilter(luceneMatchVersion,input); return new LowerCaseFilter(luceneMatchVersion,input);
} }
@Override
public Object getMultiTermComponent() {
return this;
}
} }

View File

@ -17,6 +17,7 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer; import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import java.io.Reader; import java.io.Reader;
@ -32,7 +33,7 @@ import java.util.Map;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
*/ */
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory { public class LowerCaseTokenizerFactory extends BaseTokenizerFactory implements MultiTermAwareComponent {
@Override @Override
public void init(Map<String,String> args) { public void init(Map<String,String> args) {
super.init(args); super.init(args);
@ -42,4 +43,11 @@ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
public LowerCaseTokenizer create(Reader input) { public LowerCaseTokenizer create(Reader input) {
return new LowerCaseTokenizer(luceneMatchVersion,input); return new LowerCaseTokenizer(luceneMatchVersion,input);
} }
@Override
public Object getMultiTermComponent() {
LowerCaseFilterFactory filt = new LowerCaseFilterFactory();
filt.init(args);
return filt;
}
} }

View File

@ -46,7 +46,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
* *
*/ */
public class MappingCharFilterFactory extends BaseCharFilterFactory implements public class MappingCharFilterFactory extends BaseCharFilterFactory implements
ResourceLoaderAware { ResourceLoaderAware, MultiTermAwareComponent {
protected NormalizeCharMap normMap; protected NormalizeCharMap normMap;
private String mapping; private String mapping;
@ -126,4 +126,9 @@ public class MappingCharFilterFactory extends BaseCharFilterFactory implements
} }
return new String( out, 0, writePos ); return new String( out, 0, writePos );
} }
@Override
public Object getMultiTermComponent() {
return this;
}
} }

View File

@ -0,0 +1,31 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Add to any analysis factory component to allow returning an
* analysis component factory for use with partial terms in prefix queries,
* wildcard queries, range query endpoints, regex queries, etc.
*
* @lucene.experimental
*/
public interface MultiTermAwareComponent {
/** Returns an analysis component to handle analysis of multi-term queries.
* The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
*/
public Object getMultiTermComponent();
}

View File

@ -31,10 +31,15 @@ import org.apache.lucene.analysis.fa.PersianCharFilter;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
*/ */
public class PersianCharFilterFactory extends BaseCharFilterFactory { public class PersianCharFilterFactory extends BaseCharFilterFactory implements MultiTermAwareComponent {
@Override @Override
public CharStream create(CharStream input) { public CharStream create(CharStream input) {
return new PersianCharFilter(input); return new PersianCharFilter(input);
} }
@Override
public Object getMultiTermComponent() {
return this;
}
} }

View File

@ -67,3 +67,4 @@ public interface TokenFilterFactory {
/** Transform the specified input TokenStream */ /** Transform the specified input TokenStream */
public TokenStream create(TokenStream input); public TokenStream create(TokenStream input);
} }

View File

@ -48,15 +48,13 @@ public abstract class FieldProperties {
protected final static int REQUIRED = 0x00001000; protected final static int REQUIRED = 0x00001000;
protected final static int OMIT_POSITIONS = 0x00002000; protected final static int OMIT_POSITIONS = 0x00002000;
protected final static int LEGACY_MULTITERM = 0x00004000;
static final String[] propertyNames = { static final String[] propertyNames = {
"indexed", "tokenized", "stored", "indexed", "tokenized", "stored",
"binary", "omitNorms", "omitTermFreqAndPositions", "binary", "omitNorms", "omitTermFreqAndPositions",
"termVectors", "termPositions", "termOffsets", "termVectors", "termPositions", "termOffsets",
"multiValued", "multiValued",
"sortMissingFirst","sortMissingLast","required", "omitPositions" , "sortMissingFirst","sortMissingLast","required", "omitPositions"
"legacyMultiTerm"
}; };
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>(); static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();

View File

@ -428,21 +428,6 @@ public abstract class FieldType extends FieldProperties {
*/ */
protected Analyzer queryAnalyzer=analyzer; protected Analyzer queryAnalyzer=analyzer;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type, subclasses can set analyzer themselves or override
* getAnalyzer()
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
* lowercasing filters, and charfilters.
*
* If users require old-style behavior, they can specify 'legacyMultiterm="true" ' in the schema file
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
/** /**
* Returns the Analyzer to be used when indexing fields of this type. * Returns the Analyzer to be used when indexing fields of this type.
* <p> * <p>
@ -465,20 +450,6 @@ public abstract class FieldType extends FieldProperties {
return queryAnalyzer; return queryAnalyzer;
} }
/**
* Returns the Analyzer to be used when searching fields of this type when mult-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
private final String analyzerError =
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer";
/** /**
* Sets the Analyzer to be used when indexing fields of this type. * Sets the Analyzer to be used when indexing fields of this type.
@ -524,28 +495,6 @@ public abstract class FieldType extends FieldProperties {
throw e; throw e;
} }
/**
* Sets the Analyzer to be used when querying fields of this type.
*
* <p>
*
* Subclasses that override this method need to ensure the behavior
* of the analyzer is consistent with the implementation of toInternal.
* </p>
*
* @see #toInternal
* @see #setAnalyzer
* @see #getQueryAnalyzer
*/
public void setMultiTermAnalyzer(Analyzer analyzer) {
SolrException e = new SolrException
(ErrorCode.SERVER_ERROR,
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer");
SolrException.logOnce(log,null,e);
throw e;
}
/** @lucene.internal */ /** @lucene.internal */
protected Similarity similarity; protected Similarity similarity;

View File

@ -102,15 +102,13 @@ public final class FieldTypePluginLoader
if (queryAnalyzer==null) queryAnalyzer=analyzer; if (queryAnalyzer==null) queryAnalyzer=analyzer;
if (analyzer==null) analyzer=queryAnalyzer; if (analyzer==null) analyzer=queryAnalyzer;
if (multiAnalyzer == null) { if (multiAnalyzer == null) {
Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36); multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer);
legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch :
Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
} }
if (analyzer!=null) { if (analyzer!=null) {
ft.setAnalyzer(analyzer); ft.setAnalyzer(analyzer);
ft.setQueryAnalyzer(queryAnalyzer); ft.setQueryAnalyzer(queryAnalyzer);
ft.setMultiTermAnalyzer(multiAnalyzer); if (ft instanceof TextField)
((TextField)ft).setMultiTermAnalyzer(multiAnalyzer);
} }
if (similarity!=null) { if (similarity!=null) {
ft.setSimilarity(similarity); ft.setSimilarity(similarity);
@ -143,36 +141,75 @@ public final class FieldTypePluginLoader
// 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior. // 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior.
// Do the same if they've specified that the old behavior is required (legacyMultiTerm="true") // Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) { private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
if (queryAnalyzer == null) return null; if (queryAnalyzer == null) return null;
if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) { if (!(queryAnalyzer instanceof TokenizerChain)) {
return new KeywordAnalyzer(); return new KeywordAnalyzer();
} }
TokenizerChain tc = (TokenizerChain) queryAnalyzer; TokenizerChain tc = (TokenizerChain) queryAnalyzer;
MultiTermChainBuilder builder = new MultiTermChainBuilder();
// we know it'll never be longer than this unless the code below is explicitly changed CharFilterFactory[] charFactories = tc.getCharFilterFactories();
TokenFilterFactory[] filters = new TokenFilterFactory[2]; if (charFactories != null) {
int idx = 0; for (CharFilterFactory fact : charFactories) {
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { builder.add(fact);
if (factory instanceof LowerCaseFilterFactory) {
filters[idx] = new LowerCaseFilterFactory();
filters[idx++].init(factory.getArgs());
}
if (factory instanceof ASCIIFoldingFilterFactory) {
filters[idx] = new ASCIIFoldingFilterFactory();
filters[idx++].init(factory.getArgs());
} }
} }
WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
white.init(tc.getTokenizerFactory().getArgs());
return new TokenizerChain(tc.getCharFilterFactories(), builder.add(tc.getTokenizerFactory());
white,
Arrays.copyOfRange(filters, 0, idx)); for (TokenFilterFactory fact : tc.getTokenFilterFactories()) {
builder.add(fact);
}
return builder.build();
} }
private static class MultiTermChainBuilder {
static final KeywordTokenizerFactory keyFactory;
static {
keyFactory = new KeywordTokenizerFactory();
keyFactory.init(new HashMap<String,String>());
}
ArrayList<CharFilterFactory> charFilters = null;
ArrayList<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>(2);
TokenizerFactory tokenizer = keyFactory;
public void add(Object current) {
if (!(current instanceof MultiTermAwareComponent)) return;
Object newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent();
if (newComponent instanceof TokenFilterFactory) {
if (filters == null) {
filters = new ArrayList<TokenFilterFactory>(2);
}
filters.add((TokenFilterFactory)newComponent);
} else if (newComponent instanceof TokenizerFactory) {
tokenizer = (TokenizerFactory)newComponent;
} else if (newComponent instanceof CharFilterFactory) {
if (charFilters == null) {
charFilters = new ArrayList<CharFilterFactory>(1);
}
charFilters.add( (CharFilterFactory)newComponent);
} else {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
}
}
public TokenizerChain build() {
CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
}
// //
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="...."> // <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
// //

View File

@ -97,10 +97,6 @@ public final class SchemaField extends FieldProperties {
boolean isTokenized() { return (properties & TOKENIZED)!=0; } boolean isTokenized() { return (properties & TOKENIZED)!=0; }
boolean isBinary() { return (properties & BINARY)!=0; } boolean isBinary() { return (properties & BINARY)!=0; }
boolean legacyMultiTerm() {
return (properties & LEGACY_MULTITERM) != 0;
}
public IndexableField createField(Object val, float boost) { public IndexableField createField(Object val, float boost) {
return type.createField(this,val,boost); return type.createField(this,val,boost);
} }

View File

@ -17,13 +17,8 @@
package org.apache.solr.schema; package org.apache.solr.schema;
import org.apache.lucene.search.SortField; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.Query; import org.apache.lucene.search.*;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -32,6 +27,7 @@ import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter; import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser; import org.apache.solr.search.QParser;
@ -48,6 +44,19 @@ import java.io.StringReader;
public class TextField extends FieldType { public class TextField extends FieldType {
protected boolean autoGeneratePhraseQueries; protected boolean autoGeneratePhraseQueries;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type, subclasses can set analyzer themselves or override
* getAnalyzer()
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
* lowercasing filters, and charfilters.
*
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
@Override @Override
protected void init(IndexSchema schema, Map<String,String> args) { protected void init(IndexSchema schema, Map<String,String> args) {
properties |= TOKENIZED; properties |= TOKENIZED;
@ -63,6 +72,21 @@ public class TextField extends FieldType {
super.init(schema, args); super.init(schema, args);
} }
/**
* Returns the Analyzer to be used when searching fields of this type when multi-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
public void setMultiTermAnalyzer(Analyzer analyzer) {
this.multiTermAnalyzer = analyzer;
}
public boolean getAutoGeneratePhraseQueries() { public boolean getAutoGeneratePhraseQueries() {
return autoGeneratePhraseQueries; return autoGeneratePhraseQueries;
} }
@ -98,11 +122,50 @@ public class TextField extends FieldType {
this.queryAnalyzer = analyzer; this.queryAnalyzer = analyzer;
} }
@Override @Override
public void setMultiTermAnalyzer(Analyzer analyzer) { public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
this.multiTermAnalyzer = analyzer; Analyzer multiAnalyzer = getMultiTermAnalyzer();
BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer);
BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer);
return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
} }
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
if (part == null) return null;
TokenStream source;
try {
source = analyzerIn.tokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
try {
if (!source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
termAtt.fillBytesRef();
if (source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
}
try {
source.end();
source.close();
} catch (IOException e) {
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
}
return BytesRef.deepCopyOf(bytes);
}
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) { static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
int phraseSlop = 0; int phraseSlop = 0;
boolean enablePositionIncrements = true; boolean enablePositionIncrements = true;

View File

@ -58,8 +58,9 @@ public class SolrQueryParser extends QueryParser {
protected final IndexSchema schema; protected final IndexSchema schema;
protected final QParser parser; protected final QParser parser;
protected final String defaultField; protected final String defaultField;
protected final Map<String, ReversedWildcardFilterFactory> leadingWildcards =
new HashMap<String, ReversedWildcardFilterFactory>(); // implementation detail - caching ReversedWildcardFilterFactory based on type
private Map<FieldType, ReversedWildcardFilterFactory> leadingWildcards;
public SolrQueryParser(QParser parser, String defaultField) { public SolrQueryParser(QParser parser, String defaultField) {
this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer()); this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer());
@ -71,31 +72,35 @@ public class SolrQueryParser extends QueryParser {
this.parser = parser; this.parser = parser;
this.defaultField = defaultField; this.defaultField = defaultField;
setEnablePositionIncrements(true); setEnablePositionIncrements(true);
checkAllowLeadingWildcards(); setLowercaseExpandedTerms(false);
setAllowLeadingWildcard(true);
} }
protected void checkAllowLeadingWildcards() { protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
boolean allow = false; if (leadingWildcards == null) leadingWildcards = new HashMap<FieldType, ReversedWildcardFilterFactory>();
for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) { ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType);
Analyzer a = e.getValue().getAnalyzer(); if (fac == null && leadingWildcards.containsKey(fac)) {
if (a instanceof TokenizerChain) { return fac;
// examine the indexing analysis chain if it supports leading wildcards }
TokenizerChain tc = (TokenizerChain)a;
TokenFilterFactory[] factories = tc.getTokenFilterFactories(); Analyzer a = fieldType.getAnalyzer();
for (TokenFilterFactory factory : factories) { if (a instanceof TokenizerChain) {
if (factory instanceof ReversedWildcardFilterFactory) { // examine the indexing analysis chain if it supports leading wildcards
allow = true; TokenizerChain tc = (TokenizerChain)a;
leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory); TokenFilterFactory[] factories = tc.getTokenFilterFactories();
} for (TokenFilterFactory factory : factories) {
if (factory instanceof ReversedWildcardFilterFactory) {
fac = (ReversedWildcardFilterFactory)factory;
break;
} }
} }
} }
// XXX should be enabled on a per-field basis
if (allow) { leadingWildcards.put(fieldType, fac);
setAllowLeadingWildcard(true); return fac;
}
} }
private void checkNullField(String field) throws SolrException { private void checkNullField(String field) throws SolrException {
if (field == null && defaultField == null) { if (field == null && defaultField == null) {
throw new SolrException throw new SolrException
@ -104,12 +109,14 @@ public class SolrQueryParser extends QueryParser {
} }
} }
protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) { protected String analyzeIfMultitermTermText(String field, String part, FieldType fieldType) {
if (part == null) return part; if (part == null) return part;
SchemaField sf = schema.getFieldOrNull((field)); SchemaField sf = schema.getFieldOrNull((field));
if (sf == null || ! (sf.getType() instanceof TextField)) return part; if (sf == null || ! (fieldType instanceof TextField)) return part;
return analyzeMultitermTerm(field, part, analyzer).utf8ToString(); String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString();
// System.out.println("INPUT="+part + " OUTPUT="+out);
return out;
} }
@Override @Override
@ -143,8 +150,6 @@ public class SolrQueryParser extends QueryParser {
@Override @Override
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException { protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
checkNullField(field); checkNullField(field);
part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
SchemaField sf = schema.getField(field); SchemaField sf = schema.getField(field);
return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive); return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
} }
@ -153,21 +158,10 @@ public class SolrQueryParser extends QueryParser {
protected Query getPrefixQuery(String field, String termStr) throws ParseException { protected Query getPrefixQuery(String field, String termStr) throws ParseException {
checkNullField(field); checkNullField(field);
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
// TODO: toInternal() won't necessarily work on partial // Solr has always used constant scoring for prefix queries. This should return constant scoring by default.
// values, so it looks like we need a getPrefix() function return newPrefixQuery(new Term(field, termStr));
// on fieldtype? Or at the minimum, a method on fieldType
// that can tell me if I should lowercase or not...
// Schema could tell if lowercase filter is in the chain,
// but a more sure way would be to run something through
// the first time and check if it got lowercased.
// TODO: throw exception if field type doesn't support prefixes?
// (sortable numeric types don't do prefixes, but can do range queries)
Term t = new Term(field, termStr);
PrefixQuery prefixQuery = new PrefixQuery(t);
return prefixQuery;
} }
@Override @Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException { protected Query getWildcardQuery(String field, String termStr) throws ParseException {
@ -175,10 +169,10 @@ public class SolrQueryParser extends QueryParser {
if ("*".equals(field) && "*".equals(termStr)) { if ("*".equals(field) && "*".equals(termStr)) {
return newMatchAllDocsQuery(); return newMatchAllDocsQuery();
} }
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); FieldType fieldType = schema.getFieldType(field);
termStr = analyzeIfMultitermTermText(field, termStr, fieldType);
// can we use reversed wildcards in this field? // can we use reversed wildcards in this field?
String type = schema.getFieldType(field).getTypeName(); ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
if (factory != null) { if (factory != null) {
Term term = new Term(field, termStr); Term term = new Term(field, termStr);
// fsa representing the query // fsa representing the query
@ -211,19 +205,15 @@ public class SolrQueryParser extends QueryParser {
} }
}; };
} }
Query q = super.getWildcardQuery(field, termStr);
if (q instanceof WildcardQuery) { // Solr has always used constant scoring for wildcard queries. This should return constant scoring by default.
// use a constant score query to avoid overflowing clauses return newWildcardQuery(new Term(field, termStr));
WildcardQuery wildcardQuery = new WildcardQuery(((WildcardQuery)q).getTerm());
return wildcardQuery;
}
return q;
} }
@Override
protected Query getRegexpQuery(String field, String termStr) throws ParseException protected Query getRegexpQuery(String field, String termStr) throws ParseException
{ {
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
return super.getRegexpQuery(field, termStr); return newRegexpQuery(new Term(field, termStr));
} }
} }

View File

@ -64,7 +64,7 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false"> <fieldType name="text_rev" class="solr.TextField">
<analyzer type="index"> <analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
@ -80,12 +80,25 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldType name="text_lower_tokenizer" class="solr.TextField"> <fieldType name="text_lower_token" class="solr.TextField">
<analyzer> <analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/> <tokenizer class="solr.LowerCaseTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldType name="text_oldstyle" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
<analyzer type="multiterm">
<tokenizer class="solr.KeywordTokenizerFactory" />
</analyzer>
</fieldType>
<fieldType name="text_charfilter" class="solr.TextField" multiValued="false"> <fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
<analyzer type="index"> <analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -99,19 +112,47 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true"> <fieldType name="text_straight" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_lower" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_folding" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_stemming" class="solr.TextField">
<analyzer> <analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/> <filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/> <filter class="solr.PorterStemFilterFactory"/>
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="text_keyword" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <analyzer>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <tokenizer class="solr.KeywordTokenizerFactory"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/> <fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/> <fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/> <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
@ -133,10 +174,17 @@
<field name="content_ws" type="text_ws" indexed="true" stored="true"/> <field name="content_ws" type="text_ws" indexed="true" stored="true"/>
<field name="content_rev" type="text_rev" indexed="true" stored="true"/> <field name="content_rev" type="text_rev" indexed="true" stored="true"/>
<field name="content_multi" type="text_multi" indexed="true" stored="true"/> <field name="content_multi" type="text_multi" indexed="true" stored="true"/>
<field name="content_lower_token" type="text_multi" indexed="true" stored="true"/> <field name="content_lower_token" type="text_lower_token" indexed="true" stored="true"/>
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/> <field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/> <field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/> <field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>
<dynamicField name="*_folding" type="text_folding" indexed="true" stored="true"/>
<dynamicField name="*_stemming" type="text_stemming" indexed="true" stored="true"/>
<dynamicField name="*_keyword" type="text_keyword" indexed="true" stored="true"/>
</fields> </fields>
<defaultSearchField>content</defaultSearchField> <defaultSearchField>content</defaultSearchField>

View File

@ -36,7 +36,7 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test @Test
public void testMultiFound() { public void testMultiFound() {
SchemaField field = h.getCore().getSchema().getField("content_multi"); SchemaField field = h.getCore().getSchema().getField("content_multi");
Analyzer analyzer = field.getType().getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain); assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer; TokenizerChain tc = (TokenizerChain) analyzer;
@ -58,9 +58,9 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test @Test
public void testQueryCopiedToMulti() { public void testQueryCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_charfilter"); SchemaField field = h.getCore().getSchema().getField("content_charfilter");
Analyzer analyzer = field.getType().getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain); assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer; TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue(factory instanceof LowerCaseFilterFactory); assertTrue(factory instanceof LowerCaseFilterFactory);
@ -73,15 +73,15 @@ public class MultiTermTest extends SolrTestCaseJ4 {
@Test @Test
public void testDefaultCopiedToMulti() { public void testDefaultCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_ws"); SchemaField field = h.getCore().getSchema().getField("content_ws");
Analyzer analyzer = field.getType().getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain); assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer; TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory)); assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
} }
assertTrue(tc.getCharFilterFactories().length == 0); assertTrue(tc.getCharFilterFactories() == null);
} }
} }

View File

@ -59,7 +59,12 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
"content_lower_token", docs[i], "content_lower_token", docs[i],
"content_oldstyle", docs[i], "content_oldstyle", docs[i],
"content_charfilter", docs[i], "content_charfilter", docs[i],
"content_multi_bad", docs[i] "content_multi_bad", docs[i],
"content_straight", docs[i],
"content_lower", docs[i],
"content_folding", docs[i],
"content_stemming", docs[i],
"content_keyword", docs[i]
)); ));
} }
assertU(optimize()); assertU(optimize());
@ -95,6 +100,8 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
assertQ(req("q", "content_lower_token:" + me), assertQ(req("q", "content_lower_token:" + me),
"//result[@numFound='1']", "//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']"); "//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_oldstyle:" + me),
"//result[@numFound='0']");
} }
} }
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) { for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
@ -128,13 +135,50 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
assertQ(req("q", "content_multi:" + me), assertQ(req("q", "content_multi:" + me),
"//result[@numFound='1']", "//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']"); "//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_lower_token:" + me), assertQ(req("q", "content_oldstyle:" + me),
"//result[@numFound='1']", "//result[@numFound='0']");
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
} }
} }
} }
@Test
public void testLowerTokenizer() {
// The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
}
@Test
public void testRegex() throws Exception {
assertQ(req("q", "content:/Zill[a-z]/"),
"//result[@numFound='1']");
assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased?
"//result[@numFound='1']");
assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
"//result[@numFound='1']");
assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match
"//result[@numFound='0']");
assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match
"//result[@numFound='0']");
assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
"//result[@numFound='1']");
}
@Test
public void testGeneral() throws Exception {
assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing*
assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
}
// Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go // Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
// and update the documentation // and update the documentation
@Test @Test
@ -143,17 +187,14 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
"//result[@numFound='0']"); "//result[@numFound='0']");
} }
// Make sure the legacy behavior flag is honored
@Test
public void testLegacyBehavior() {
assertQ(req("q", "content_oldstyle:ABCD*"),
"//result[@numFound='0']");
}
@Test @Test
public void testWildcardRange() { public void testWildcardRange() {
assertQ(req("q", "content:[* TO *]"), assertQ(req("q", "content:[* TO *]"),
"//result[@numFound='3']"); "//result[@numFound='3']");
assertQ(req("q", "content:[AB* TO Z*]"),
"//result[@numFound='3']");
assertQ(req("q", "content:[AB*E?G* TO TU*W]"),
"//result[@numFound='3']");
} }
@ -222,10 +263,13 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
@Test @Test
public void testMultiBad() { public void testMultiBad() {
try { try {
ignoreException("analyzer returned too many terms");
assertQ(req("q", "content_multi_bad:" + "abCD*")); assertQ(req("q", "content_multi_bad:" + "abCD*"));
fail("Should throw exception when token evaluates to more than one term"); fail("Should throw exception when token evaluates to more than one term");
} catch (Exception expected) { } catch (Exception expected) {
assertTrue(expected.getCause() instanceof IllegalArgumentException); assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
} finally {
resetExceptionIgnores();
} }
} }
} }

View File

@ -427,41 +427,6 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<!-- Illustrates the new "multiterm" analyzer definition the <fieldType> can take a new
parameter legacyMultiTerm="true" if the old behvaior is desired. The new default
behavior as of 3.6+ is to automatically define a multiterm analyzer
-->
<fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
applied to wildcard terms (prefix, wildcard range) if specified. This allows, among other
things, not having to lowercase wildcard terms on the client.
In the absence of this section, the new default behavior (3.6, 4.0) is to construct
one of these from the query analyzer that incorporates any defined charfilters, a
WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
(if defined).
Arguably, this is an expert-level analyzer, most cases will be handled by an instance
of this being automatically constructed from the queryanalyzer.
-->
<analyzer type="multiterm">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed, <!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. --> any data added to them will be ignored outright. -->
@ -587,6 +552,7 @@
<dynamicField name="*_l" type="long" indexed="true" stored="true"/> <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/> <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/> <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/> <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/> <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/> <dynamicField name="*_d" type="double" indexed="true" stored="true"/>