mirror of https://github.com/apache/lucene.git
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206767 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5c4063bef2
commit
c94c1c5a64
|
@ -193,6 +193,11 @@ New Features
|
||||||
a complete analysis chain for multiterm queries.
|
a complete analysis chain for multiterm queries.
|
||||||
(Pete Sturge, Erick Erickson, Mentoring from Seeley and Muir)
|
(Pete Sturge, Erick Erickson, Mentoring from Seeley and Muir)
|
||||||
|
|
||||||
|
* SOLR-2918: Improvement to SOLR-2438; added MultiTermAwareComponent to the various classes
|
||||||
|
that should transform multiterm queries in various ways, and use this as the criteria for
|
||||||
|
adding them to the multiterm analyzer that is constructed if not specified in the
|
||||||
|
<fieldType>
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -32,9 +32,14 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
|
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
public ASCIIFoldingFilter create(TokenStream input) {
|
public ASCIIFoldingFilter create(TokenStream input) {
|
||||||
return new ASCIIFoldingFilter(input);
|
return new ASCIIFoldingFilter(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
|
public class LowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
@Override
|
@Override
|
||||||
public void init(Map<String,String> args) {
|
public void init(Map<String,String> args) {
|
||||||
super.init(args);
|
super.init(args);
|
||||||
|
@ -43,4 +43,9 @@ public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
|
||||||
public LowerCaseFilter create(TokenStream input) {
|
public LowerCaseFilter create(TokenStream input) {
|
||||||
return new LowerCaseFilter(luceneMatchVersion,input);
|
return new LowerCaseFilter(luceneMatchVersion,input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -32,7 +33,7 @@ import java.util.Map;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
|
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory implements MultiTermAwareComponent {
|
||||||
@Override
|
@Override
|
||||||
public void init(Map<String,String> args) {
|
public void init(Map<String,String> args) {
|
||||||
super.init(args);
|
super.init(args);
|
||||||
|
@ -42,4 +43,11 @@ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
|
||||||
public LowerCaseTokenizer create(Reader input) {
|
public LowerCaseTokenizer create(Reader input) {
|
||||||
return new LowerCaseTokenizer(luceneMatchVersion,input);
|
return new LowerCaseTokenizer(luceneMatchVersion,input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
LowerCaseFilterFactory filt = new LowerCaseFilterFactory();
|
||||||
|
filt.init(args);
|
||||||
|
return filt;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||||
ResourceLoaderAware {
|
ResourceLoaderAware, MultiTermAwareComponent {
|
||||||
|
|
||||||
protected NormalizeCharMap normMap;
|
protected NormalizeCharMap normMap;
|
||||||
private String mapping;
|
private String mapping;
|
||||||
|
@ -126,4 +126,9 @@ public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||||
}
|
}
|
||||||
return new String( out, 0, writePos );
|
return new String( out, 0, writePos );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Add to any analysis factory component to allow returning an
|
||||||
|
* analysis component factory for use with partial terms in prefix queries,
|
||||||
|
* wildcard queries, range query endpoints, regex queries, etc.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public interface MultiTermAwareComponent {
|
||||||
|
/** Returns an analysis component to handle analysis of multi-term queries.
|
||||||
|
* The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
|
||||||
|
*/
|
||||||
|
public Object getMultiTermComponent();
|
||||||
|
}
|
|
@ -31,10 +31,15 @@ import org.apache.lucene.analysis.fa.PersianCharFilter;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class PersianCharFilterFactory extends BaseCharFilterFactory {
|
public class PersianCharFilterFactory extends BaseCharFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CharStream create(CharStream input) {
|
public CharStream create(CharStream input) {
|
||||||
return new PersianCharFilter(input);
|
return new PersianCharFilter(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,3 +67,4 @@ public interface TokenFilterFactory {
|
||||||
/** Transform the specified input TokenStream */
|
/** Transform the specified input TokenStream */
|
||||||
public TokenStream create(TokenStream input);
|
public TokenStream create(TokenStream input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,15 +48,13 @@ public abstract class FieldProperties {
|
||||||
|
|
||||||
protected final static int REQUIRED = 0x00001000;
|
protected final static int REQUIRED = 0x00001000;
|
||||||
protected final static int OMIT_POSITIONS = 0x00002000;
|
protected final static int OMIT_POSITIONS = 0x00002000;
|
||||||
protected final static int LEGACY_MULTITERM = 0x00004000;
|
|
||||||
|
|
||||||
static final String[] propertyNames = {
|
static final String[] propertyNames = {
|
||||||
"indexed", "tokenized", "stored",
|
"indexed", "tokenized", "stored",
|
||||||
"binary", "omitNorms", "omitTermFreqAndPositions",
|
"binary", "omitNorms", "omitTermFreqAndPositions",
|
||||||
"termVectors", "termPositions", "termOffsets",
|
"termVectors", "termPositions", "termOffsets",
|
||||||
"multiValued",
|
"multiValued",
|
||||||
"sortMissingFirst","sortMissingLast","required", "omitPositions" ,
|
"sortMissingFirst","sortMissingLast","required", "omitPositions"
|
||||||
"legacyMultiTerm"
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();
|
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();
|
||||||
|
|
|
@ -428,21 +428,6 @@ public abstract class FieldType extends FieldProperties {
|
||||||
*/
|
*/
|
||||||
protected Analyzer queryAnalyzer=analyzer;
|
protected Analyzer queryAnalyzer=analyzer;
|
||||||
|
|
||||||
/**
|
|
||||||
* Analyzer set by schema for text types to use when searching fields
|
|
||||||
* of this type, subclasses can set analyzer themselves or override
|
|
||||||
* getAnalyzer()
|
|
||||||
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
|
|
||||||
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
|
|
||||||
* lowercasing filters, and charfilters.
|
|
||||||
*
|
|
||||||
* If users require old-style behavior, they can specify 'legacyMultiterm="true" ' in the schema file
|
|
||||||
* @see #getMultiTermAnalyzer
|
|
||||||
* @see #setMultiTermAnalyzer
|
|
||||||
*/
|
|
||||||
protected Analyzer multiTermAnalyzer=null;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the Analyzer to be used when indexing fields of this type.
|
* Returns the Analyzer to be used when indexing fields of this type.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -465,20 +450,6 @@ public abstract class FieldType extends FieldProperties {
|
||||||
return queryAnalyzer;
|
return queryAnalyzer;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the Analyzer to be used when searching fields of this type when mult-term queries are specified.
|
|
||||||
* <p>
|
|
||||||
* This method may be called many times, at any time.
|
|
||||||
* </p>
|
|
||||||
* @see #getAnalyzer
|
|
||||||
*/
|
|
||||||
public Analyzer getMultiTermAnalyzer() {
|
|
||||||
return multiTermAnalyzer;
|
|
||||||
}
|
|
||||||
|
|
||||||
private final String analyzerError =
|
|
||||||
"FieldType: " + this.getClass().getSimpleName() +
|
|
||||||
" (" + typeName + ") does not support specifying an analyzer";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the Analyzer to be used when indexing fields of this type.
|
* Sets the Analyzer to be used when indexing fields of this type.
|
||||||
|
@ -524,28 +495,6 @@ public abstract class FieldType extends FieldProperties {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the Analyzer to be used when querying fields of this type.
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
*
|
|
||||||
* Subclasses that override this method need to ensure the behavior
|
|
||||||
* of the analyzer is consistent with the implementation of toInternal.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @see #toInternal
|
|
||||||
* @see #setAnalyzer
|
|
||||||
* @see #getQueryAnalyzer
|
|
||||||
*/
|
|
||||||
public void setMultiTermAnalyzer(Analyzer analyzer) {
|
|
||||||
SolrException e = new SolrException
|
|
||||||
(ErrorCode.SERVER_ERROR,
|
|
||||||
"FieldType: " + this.getClass().getSimpleName() +
|
|
||||||
" (" + typeName + ") does not support specifying an analyzer");
|
|
||||||
SolrException.logOnce(log,null,e);
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @lucene.internal */
|
/** @lucene.internal */
|
||||||
protected Similarity similarity;
|
protected Similarity similarity;
|
||||||
|
|
||||||
|
|
|
@ -102,15 +102,13 @@ public final class FieldTypePluginLoader
|
||||||
if (queryAnalyzer==null) queryAnalyzer=analyzer;
|
if (queryAnalyzer==null) queryAnalyzer=analyzer;
|
||||||
if (analyzer==null) analyzer=queryAnalyzer;
|
if (analyzer==null) analyzer=queryAnalyzer;
|
||||||
if (multiAnalyzer == null) {
|
if (multiAnalyzer == null) {
|
||||||
Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36);
|
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer);
|
||||||
legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch :
|
|
||||||
Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
|
|
||||||
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
|
|
||||||
}
|
}
|
||||||
if (analyzer!=null) {
|
if (analyzer!=null) {
|
||||||
ft.setAnalyzer(analyzer);
|
ft.setAnalyzer(analyzer);
|
||||||
ft.setQueryAnalyzer(queryAnalyzer);
|
ft.setQueryAnalyzer(queryAnalyzer);
|
||||||
ft.setMultiTermAnalyzer(multiAnalyzer);
|
if (ft instanceof TextField)
|
||||||
|
((TextField)ft).setMultiTermAnalyzer(multiAnalyzer);
|
||||||
}
|
}
|
||||||
if (similarity!=null) {
|
if (similarity!=null) {
|
||||||
ft.setSimilarity(similarity);
|
ft.setSimilarity(similarity);
|
||||||
|
@ -143,36 +141,75 @@ public final class FieldTypePluginLoader
|
||||||
// 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior.
|
// 2> If legacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior.
|
||||||
// Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
|
// Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
|
||||||
|
|
||||||
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) {
|
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
|
||||||
if (queryAnalyzer == null) return null;
|
if (queryAnalyzer == null) return null;
|
||||||
|
|
||||||
if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) {
|
if (!(queryAnalyzer instanceof TokenizerChain)) {
|
||||||
return new KeywordAnalyzer();
|
return new KeywordAnalyzer();
|
||||||
}
|
}
|
||||||
|
|
||||||
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
|
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
|
||||||
|
MultiTermChainBuilder builder = new MultiTermChainBuilder();
|
||||||
|
|
||||||
// we know it'll never be longer than this unless the code below is explicitly changed
|
CharFilterFactory[] charFactories = tc.getCharFilterFactories();
|
||||||
TokenFilterFactory[] filters = new TokenFilterFactory[2];
|
if (charFactories != null) {
|
||||||
int idx = 0;
|
for (CharFilterFactory fact : charFactories) {
|
||||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
builder.add(fact);
|
||||||
if (factory instanceof LowerCaseFilterFactory) {
|
|
||||||
filters[idx] = new LowerCaseFilterFactory();
|
|
||||||
filters[idx++].init(factory.getArgs());
|
|
||||||
}
|
|
||||||
if (factory instanceof ASCIIFoldingFilterFactory) {
|
|
||||||
filters[idx] = new ASCIIFoldingFilterFactory();
|
|
||||||
filters[idx++].init(factory.getArgs());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
|
|
||||||
white.init(tc.getTokenizerFactory().getArgs());
|
|
||||||
|
|
||||||
return new TokenizerChain(tc.getCharFilterFactories(),
|
builder.add(tc.getTokenizerFactory());
|
||||||
white,
|
|
||||||
Arrays.copyOfRange(filters, 0, idx));
|
for (TokenFilterFactory fact : tc.getTokenFilterFactories()) {
|
||||||
|
builder.add(fact);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class MultiTermChainBuilder {
|
||||||
|
static final KeywordTokenizerFactory keyFactory;
|
||||||
|
|
||||||
|
static {
|
||||||
|
keyFactory = new KeywordTokenizerFactory();
|
||||||
|
keyFactory.init(new HashMap<String,String>());
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<CharFilterFactory> charFilters = null;
|
||||||
|
ArrayList<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>(2);
|
||||||
|
TokenizerFactory tokenizer = keyFactory;
|
||||||
|
|
||||||
|
public void add(Object current) {
|
||||||
|
if (!(current instanceof MultiTermAwareComponent)) return;
|
||||||
|
Object newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent();
|
||||||
|
if (newComponent instanceof TokenFilterFactory) {
|
||||||
|
if (filters == null) {
|
||||||
|
filters = new ArrayList<TokenFilterFactory>(2);
|
||||||
|
}
|
||||||
|
filters.add((TokenFilterFactory)newComponent);
|
||||||
|
} else if (newComponent instanceof TokenizerFactory) {
|
||||||
|
tokenizer = (TokenizerFactory)newComponent;
|
||||||
|
} else if (newComponent instanceof CharFilterFactory) {
|
||||||
|
if (charFilters == null) {
|
||||||
|
charFilters = new ArrayList<CharFilterFactory>(1);
|
||||||
|
}
|
||||||
|
charFilters.add( (CharFilterFactory)newComponent);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenizerChain build() {
|
||||||
|
CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
|
||||||
|
TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
|
||||||
|
return new TokenizerChain(charFilterArr, tokenizer, filterArr);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
|
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
|
||||||
//
|
//
|
||||||
|
|
|
@ -97,10 +97,6 @@ public final class SchemaField extends FieldProperties {
|
||||||
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
|
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
|
||||||
boolean isBinary() { return (properties & BINARY)!=0; }
|
boolean isBinary() { return (properties & BINARY)!=0; }
|
||||||
|
|
||||||
boolean legacyMultiTerm() {
|
|
||||||
return (properties & LEGACY_MULTITERM) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public IndexableField createField(Object val, float boost) {
|
public IndexableField createField(Object val, float boost) {
|
||||||
return type.createField(this,val,boost);
|
return type.createField(this,val,boost);
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,13 +17,8 @@
|
||||||
|
|
||||||
package org.apache.solr.schema;
|
package org.apache.solr.schema;
|
||||||
|
|
||||||
import org.apache.lucene.search.SortField;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.*;
|
||||||
import org.apache.lucene.search.PhraseQuery;
|
|
||||||
import org.apache.lucene.search.TermQuery;
|
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
|
||||||
import org.apache.lucene.search.BooleanClause;
|
|
||||||
import org.apache.lucene.search.MultiPhraseQuery;
|
|
||||||
import org.apache.lucene.index.IndexableField;
|
import org.apache.lucene.index.IndexableField;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
@ -32,6 +27,7 @@ import org.apache.lucene.analysis.CachingTokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.response.TextResponseWriter;
|
import org.apache.solr.response.TextResponseWriter;
|
||||||
import org.apache.solr.search.QParser;
|
import org.apache.solr.search.QParser;
|
||||||
|
|
||||||
|
@ -48,6 +44,19 @@ import java.io.StringReader;
|
||||||
public class TextField extends FieldType {
|
public class TextField extends FieldType {
|
||||||
protected boolean autoGeneratePhraseQueries;
|
protected boolean autoGeneratePhraseQueries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer set by schema for text types to use when searching fields
|
||||||
|
* of this type, subclasses can set analyzer themselves or override
|
||||||
|
* getAnalyzer()
|
||||||
|
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
|
||||||
|
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
|
||||||
|
* lowercasing filters, and charfilters.
|
||||||
|
*
|
||||||
|
* @see #getMultiTermAnalyzer
|
||||||
|
* @see #setMultiTermAnalyzer
|
||||||
|
*/
|
||||||
|
protected Analyzer multiTermAnalyzer=null;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void init(IndexSchema schema, Map<String,String> args) {
|
protected void init(IndexSchema schema, Map<String,String> args) {
|
||||||
properties |= TOKENIZED;
|
properties |= TOKENIZED;
|
||||||
|
@ -63,6 +72,21 @@ public class TextField extends FieldType {
|
||||||
super.init(schema, args);
|
super.init(schema, args);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Analyzer to be used when searching fields of this type when multi-term queries are specified.
|
||||||
|
* <p>
|
||||||
|
* This method may be called many times, at any time.
|
||||||
|
* </p>
|
||||||
|
* @see #getAnalyzer
|
||||||
|
*/
|
||||||
|
public Analyzer getMultiTermAnalyzer() {
|
||||||
|
return multiTermAnalyzer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMultiTermAnalyzer(Analyzer analyzer) {
|
||||||
|
this.multiTermAnalyzer = analyzer;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean getAutoGeneratePhraseQueries() {
|
public boolean getAutoGeneratePhraseQueries() {
|
||||||
return autoGeneratePhraseQueries;
|
return autoGeneratePhraseQueries;
|
||||||
}
|
}
|
||||||
|
@ -98,11 +122,50 @@ public class TextField extends FieldType {
|
||||||
this.queryAnalyzer = analyzer;
|
this.queryAnalyzer = analyzer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setMultiTermAnalyzer(Analyzer analyzer) {
|
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
|
||||||
this.multiTermAnalyzer = analyzer;
|
Analyzer multiAnalyzer = getMultiTermAnalyzer();
|
||||||
|
BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer);
|
||||||
|
BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer);
|
||||||
|
return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
|
||||||
|
if (part == null) return null;
|
||||||
|
|
||||||
|
TokenStream source;
|
||||||
|
try {
|
||||||
|
source = analyzerIn.tokenStream(field, new StringReader(part));
|
||||||
|
source.reset();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||||
|
BytesRef bytes = termAtt.getBytesRef();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (!source.incrementToken())
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
|
||||||
|
termAtt.fillBytesRef();
|
||||||
|
if (source.incrementToken())
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
source.end();
|
||||||
|
source.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return BytesRef.deepCopyOf(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
|
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
|
||||||
int phraseSlop = 0;
|
int phraseSlop = 0;
|
||||||
boolean enablePositionIncrements = true;
|
boolean enablePositionIncrements = true;
|
||||||
|
|
|
@ -58,8 +58,9 @@ public class SolrQueryParser extends QueryParser {
|
||||||
protected final IndexSchema schema;
|
protected final IndexSchema schema;
|
||||||
protected final QParser parser;
|
protected final QParser parser;
|
||||||
protected final String defaultField;
|
protected final String defaultField;
|
||||||
protected final Map<String, ReversedWildcardFilterFactory> leadingWildcards =
|
|
||||||
new HashMap<String, ReversedWildcardFilterFactory>();
|
// implementation detail - caching ReversedWildcardFilterFactory based on type
|
||||||
|
private Map<FieldType, ReversedWildcardFilterFactory> leadingWildcards;
|
||||||
|
|
||||||
public SolrQueryParser(QParser parser, String defaultField) {
|
public SolrQueryParser(QParser parser, String defaultField) {
|
||||||
this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer());
|
this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer());
|
||||||
|
@ -71,31 +72,35 @@ public class SolrQueryParser extends QueryParser {
|
||||||
this.parser = parser;
|
this.parser = parser;
|
||||||
this.defaultField = defaultField;
|
this.defaultField = defaultField;
|
||||||
setEnablePositionIncrements(true);
|
setEnablePositionIncrements(true);
|
||||||
checkAllowLeadingWildcards();
|
setLowercaseExpandedTerms(false);
|
||||||
|
setAllowLeadingWildcard(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void checkAllowLeadingWildcards() {
|
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
|
||||||
boolean allow = false;
|
if (leadingWildcards == null) leadingWildcards = new HashMap<FieldType, ReversedWildcardFilterFactory>();
|
||||||
for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) {
|
ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType);
|
||||||
Analyzer a = e.getValue().getAnalyzer();
|
if (fac == null && leadingWildcards.containsKey(fac)) {
|
||||||
if (a instanceof TokenizerChain) {
|
return fac;
|
||||||
// examine the indexing analysis chain if it supports leading wildcards
|
}
|
||||||
TokenizerChain tc = (TokenizerChain)a;
|
|
||||||
TokenFilterFactory[] factories = tc.getTokenFilterFactories();
|
Analyzer a = fieldType.getAnalyzer();
|
||||||
for (TokenFilterFactory factory : factories) {
|
if (a instanceof TokenizerChain) {
|
||||||
if (factory instanceof ReversedWildcardFilterFactory) {
|
// examine the indexing analysis chain if it supports leading wildcards
|
||||||
allow = true;
|
TokenizerChain tc = (TokenizerChain)a;
|
||||||
leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory);
|
TokenFilterFactory[] factories = tc.getTokenFilterFactories();
|
||||||
}
|
for (TokenFilterFactory factory : factories) {
|
||||||
|
if (factory instanceof ReversedWildcardFilterFactory) {
|
||||||
|
fac = (ReversedWildcardFilterFactory)factory;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// XXX should be enabled on a per-field basis
|
|
||||||
if (allow) {
|
leadingWildcards.put(fieldType, fac);
|
||||||
setAllowLeadingWildcard(true);
|
return fac;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void checkNullField(String field) throws SolrException {
|
private void checkNullField(String field) throws SolrException {
|
||||||
if (field == null && defaultField == null) {
|
if (field == null && defaultField == null) {
|
||||||
throw new SolrException
|
throw new SolrException
|
||||||
|
@ -104,12 +109,14 @@ public class SolrQueryParser extends QueryParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) {
|
protected String analyzeIfMultitermTermText(String field, String part, FieldType fieldType) {
|
||||||
if (part == null) return part;
|
if (part == null) return part;
|
||||||
|
|
||||||
SchemaField sf = schema.getFieldOrNull((field));
|
SchemaField sf = schema.getFieldOrNull((field));
|
||||||
if (sf == null || ! (sf.getType() instanceof TextField)) return part;
|
if (sf == null || ! (fieldType instanceof TextField)) return part;
|
||||||
return analyzeMultitermTerm(field, part, analyzer).utf8ToString();
|
String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString();
|
||||||
|
// System.out.println("INPUT="+part + " OUTPUT="+out);
|
||||||
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -143,8 +150,6 @@ public class SolrQueryParser extends QueryParser {
|
||||||
@Override
|
@Override
|
||||||
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
|
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
|
||||||
checkNullField(field);
|
checkNullField(field);
|
||||||
part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
|
|
||||||
part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
|
|
||||||
SchemaField sf = schema.getField(field);
|
SchemaField sf = schema.getField(field);
|
||||||
return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
|
return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
|
||||||
}
|
}
|
||||||
|
@ -153,21 +158,10 @@ public class SolrQueryParser extends QueryParser {
|
||||||
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
|
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
|
||||||
checkNullField(field);
|
checkNullField(field);
|
||||||
|
|
||||||
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
|
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
|
||||||
|
|
||||||
// TODO: toInternal() won't necessarily work on partial
|
// Solr has always used constant scoring for prefix queries. This should return constant scoring by default.
|
||||||
// values, so it looks like we need a getPrefix() function
|
return newPrefixQuery(new Term(field, termStr));
|
||||||
// on fieldtype? Or at the minimum, a method on fieldType
|
|
||||||
// that can tell me if I should lowercase or not...
|
|
||||||
// Schema could tell if lowercase filter is in the chain,
|
|
||||||
// but a more sure way would be to run something through
|
|
||||||
// the first time and check if it got lowercased.
|
|
||||||
|
|
||||||
// TODO: throw exception if field type doesn't support prefixes?
|
|
||||||
// (sortable numeric types don't do prefixes, but can do range queries)
|
|
||||||
Term t = new Term(field, termStr);
|
|
||||||
PrefixQuery prefixQuery = new PrefixQuery(t);
|
|
||||||
return prefixQuery;
|
|
||||||
}
|
}
|
||||||
@Override
|
@Override
|
||||||
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
|
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
|
||||||
|
@ -175,10 +169,10 @@ public class SolrQueryParser extends QueryParser {
|
||||||
if ("*".equals(field) && "*".equals(termStr)) {
|
if ("*".equals(field) && "*".equals(termStr)) {
|
||||||
return newMatchAllDocsQuery();
|
return newMatchAllDocsQuery();
|
||||||
}
|
}
|
||||||
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
|
FieldType fieldType = schema.getFieldType(field);
|
||||||
|
termStr = analyzeIfMultitermTermText(field, termStr, fieldType);
|
||||||
// can we use reversed wildcards in this field?
|
// can we use reversed wildcards in this field?
|
||||||
String type = schema.getFieldType(field).getTypeName();
|
ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
|
||||||
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
|
|
||||||
if (factory != null) {
|
if (factory != null) {
|
||||||
Term term = new Term(field, termStr);
|
Term term = new Term(field, termStr);
|
||||||
// fsa representing the query
|
// fsa representing the query
|
||||||
|
@ -211,19 +205,15 @@ public class SolrQueryParser extends QueryParser {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
Query q = super.getWildcardQuery(field, termStr);
|
|
||||||
if (q instanceof WildcardQuery) {
|
// Solr has always used constant scoring for wildcard queries. This should return constant scoring by default.
|
||||||
// use a constant score query to avoid overflowing clauses
|
return newWildcardQuery(new Term(field, termStr));
|
||||||
WildcardQuery wildcardQuery = new WildcardQuery(((WildcardQuery)q).getTerm());
|
|
||||||
return wildcardQuery;
|
|
||||||
}
|
|
||||||
return q;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
protected Query getRegexpQuery(String field, String termStr) throws ParseException
|
protected Query getRegexpQuery(String field, String termStr) throws ParseException
|
||||||
{
|
{
|
||||||
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
|
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field));
|
||||||
return super.getRegexpQuery(field, termStr);
|
return newRegexpQuery(new Term(field, termStr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false">
|
<fieldType name="text_rev" class="solr.TextField">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
@ -80,12 +80,25 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<fieldType name="text_lower_tokenizer" class="solr.TextField">
|
<fieldType name="text_lower_token" class="solr.TextField">
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
|
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_oldstyle" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="multiterm">
|
||||||
|
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
|
||||||
<fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
|
<fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
@ -99,19 +112,47 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true">
|
<fieldType name="text_straight" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_lower" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_folding" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<fieldType name="text_stemming" class="solr.TextField">
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.TrimFilterFactory"/>
|
<filter class="solr.PorterStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="text_keyword" class="solr.TextField" sortMissingLast="true" omitNorms="true">
|
||||||
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
<analyzer>
|
||||||
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||||
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
<filter class="solr.LowerCaseFilterFactory" />
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
|
||||||
|
<fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="long" class="solr.TrieLongField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
|
||||||
<fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
|
||||||
<fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
|
<fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
|
||||||
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
||||||
|
@ -133,10 +174,17 @@
|
||||||
<field name="content_ws" type="text_ws" indexed="true" stored="true"/>
|
<field name="content_ws" type="text_ws" indexed="true" stored="true"/>
|
||||||
<field name="content_rev" type="text_rev" indexed="true" stored="true"/>
|
<field name="content_rev" type="text_rev" indexed="true" stored="true"/>
|
||||||
<field name="content_multi" type="text_multi" indexed="true" stored="true"/>
|
<field name="content_multi" type="text_multi" indexed="true" stored="true"/>
|
||||||
<field name="content_lower_token" type="text_multi" indexed="true" stored="true"/>
|
<field name="content_lower_token" type="text_lower_token" indexed="true" stored="true"/>
|
||||||
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
|
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
|
||||||
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
|
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
|
||||||
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
|
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<dynamicField name="*_straight" type="text_straight" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_lower" type="text_lower" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_folding" type="text_folding" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_stemming" type="text_stemming" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_keyword" type="text_keyword" indexed="true" stored="true"/>
|
||||||
|
|
||||||
</fields>
|
</fields>
|
||||||
|
|
||||||
<defaultSearchField>content</defaultSearchField>
|
<defaultSearchField>content</defaultSearchField>
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class MultiTermTest extends SolrTestCaseJ4 {
|
||||||
@Test
|
@Test
|
||||||
public void testMultiFound() {
|
public void testMultiFound() {
|
||||||
SchemaField field = h.getCore().getSchema().getField("content_multi");
|
SchemaField field = h.getCore().getSchema().getField("content_multi");
|
||||||
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
|
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||||
assertTrue(analyzer instanceof TokenizerChain);
|
assertTrue(analyzer instanceof TokenizerChain);
|
||||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
||||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||||
|
@ -58,9 +58,9 @@ public class MultiTermTest extends SolrTestCaseJ4 {
|
||||||
@Test
|
@Test
|
||||||
public void testQueryCopiedToMulti() {
|
public void testQueryCopiedToMulti() {
|
||||||
SchemaField field = h.getCore().getSchema().getField("content_charfilter");
|
SchemaField field = h.getCore().getSchema().getField("content_charfilter");
|
||||||
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
|
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||||
assertTrue(analyzer instanceof TokenizerChain);
|
assertTrue(analyzer instanceof TokenizerChain);
|
||||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
|
||||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||||
assertTrue(factory instanceof LowerCaseFilterFactory);
|
assertTrue(factory instanceof LowerCaseFilterFactory);
|
||||||
|
@ -73,15 +73,15 @@ public class MultiTermTest extends SolrTestCaseJ4 {
|
||||||
@Test
|
@Test
|
||||||
public void testDefaultCopiedToMulti() {
|
public void testDefaultCopiedToMulti() {
|
||||||
SchemaField field = h.getCore().getSchema().getField("content_ws");
|
SchemaField field = h.getCore().getSchema().getField("content_ws");
|
||||||
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
|
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||||
assertTrue(analyzer instanceof TokenizerChain);
|
assertTrue(analyzer instanceof TokenizerChain);
|
||||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
|
||||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||||
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
|
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
|
||||||
}
|
}
|
||||||
|
|
||||||
assertTrue(tc.getCharFilterFactories().length == 0);
|
assertTrue(tc.getCharFilterFactories() == null);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,7 +59,12 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
"content_lower_token", docs[i],
|
"content_lower_token", docs[i],
|
||||||
"content_oldstyle", docs[i],
|
"content_oldstyle", docs[i],
|
||||||
"content_charfilter", docs[i],
|
"content_charfilter", docs[i],
|
||||||
"content_multi_bad", docs[i]
|
"content_multi_bad", docs[i],
|
||||||
|
"content_straight", docs[i],
|
||||||
|
"content_lower", docs[i],
|
||||||
|
"content_folding", docs[i],
|
||||||
|
"content_stemming", docs[i],
|
||||||
|
"content_keyword", docs[i]
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
assertU(optimize());
|
assertU(optimize());
|
||||||
|
@ -95,6 +100,8 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
assertQ(req("q", "content_lower_token:" + me),
|
assertQ(req("q", "content_lower_token:" + me),
|
||||||
"//result[@numFound='1']",
|
"//result[@numFound='1']",
|
||||||
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
|
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
|
||||||
|
assertQ(req("q", "content_oldstyle:" + me),
|
||||||
|
"//result[@numFound='0']");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
|
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
|
||||||
|
@ -128,13 +135,50 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
assertQ(req("q", "content_multi:" + me),
|
assertQ(req("q", "content_multi:" + me),
|
||||||
"//result[@numFound='1']",
|
"//result[@numFound='1']",
|
||||||
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
|
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
|
||||||
assertQ(req("q", "content_lower_token:" + me),
|
assertQ(req("q", "content_oldstyle:" + me),
|
||||||
"//result[@numFound='1']",
|
"//result[@numFound='0']");
|
||||||
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLowerTokenizer() {
|
||||||
|
// The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
|
||||||
|
assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
|
||||||
|
assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
|
||||||
|
assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
|
||||||
|
assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
|
||||||
|
assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRegex() throws Exception {
|
||||||
|
assertQ(req("q", "content:/Zill[a-z]/"),
|
||||||
|
"//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased?
|
||||||
|
"//result[@numFound='1']");
|
||||||
|
assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
|
||||||
|
"//result[@numFound='1']");
|
||||||
|
|
||||||
|
assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match
|
||||||
|
"//result[@numFound='0']");
|
||||||
|
assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match
|
||||||
|
"//result[@numFound='0']");
|
||||||
|
|
||||||
|
assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
|
||||||
|
"//result[@numFound='1']");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGeneral() throws Exception {
|
||||||
|
assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing*)
|
||||||
|
assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
|
||||||
|
}
|
||||||
|
|
||||||
// Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
|
// Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
|
||||||
// and update the documentation
|
// and update the documentation
|
||||||
@Test
|
@Test
|
||||||
|
@ -143,17 +187,14 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
"//result[@numFound='0']");
|
"//result[@numFound='0']");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure the legacy behavior flag is honored
|
|
||||||
@Test
|
|
||||||
public void testLegacyBehavior() {
|
|
||||||
assertQ(req("q", "content_oldstyle:ABCD*"),
|
|
||||||
"//result[@numFound='0']");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWildcardRange() {
|
public void testWildcardRange() {
|
||||||
assertQ(req("q", "content:[* TO *]"),
|
assertQ(req("q", "content:[* TO *]"),
|
||||||
"//result[@numFound='3']");
|
"//result[@numFound='3']");
|
||||||
|
assertQ(req("q", "content:[AB* TO Z*]"),
|
||||||
|
"//result[@numFound='3']");
|
||||||
|
assertQ(req("q", "content:[AB*E?G* TO TU*W]"),
|
||||||
|
"//result[@numFound='3']");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -222,10 +263,13 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
||||||
@Test
|
@Test
|
||||||
public void testMultiBad() {
|
public void testMultiBad() {
|
||||||
try {
|
try {
|
||||||
|
ignoreException("analyzer returned too many terms");
|
||||||
assertQ(req("q", "content_multi_bad:" + "abCD*"));
|
assertQ(req("q", "content_multi_bad:" + "abCD*"));
|
||||||
fail("Should throw exception when token evaluates to more than one term");
|
fail("Should throw exception when token evaluates to more than one term");
|
||||||
} catch (Exception expected) {
|
} catch (Exception expected) {
|
||||||
assertTrue(expected.getCause() instanceof IllegalArgumentException);
|
assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
|
||||||
|
} finally {
|
||||||
|
resetExceptionIgnores();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -427,41 +427,6 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<!-- Illustrates the new "multiterm" analyzer definition. The <fieldType> can take a new
|
|
||||||
parameter legacyMultiTerm="true" if the old behavior is desired. The new default
|
|
||||||
behavior as of 3.6+ is to automatically define a multiterm analyzer
|
|
||||||
-->
|
|
||||||
<fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
|
|
||||||
<analyzer type="index">
|
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
|
||||||
</analyzer>
|
|
||||||
<analyzer type="query">
|
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
|
||||||
</analyzer>
|
|
||||||
<!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
|
|
||||||
Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
|
|
||||||
applied to wildcard terms (prefix, wildcard, range) if specified. This allows, among other
|
|
||||||
things, not having to lowercase wildcard terms on the client.
|
|
||||||
|
|
||||||
In the absence of this section, the new default behavior (3.6, 4.0) is to construct
|
|
||||||
one of these from the query analyzer that incorporates any defined charfilters, a
|
|
||||||
WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
|
|
||||||
(if defined).
|
|
||||||
|
|
||||||
Arguably, this is an expert-level analyzer, most cases will be handled by an instance
|
|
||||||
of this being automatically constructed from the queryanalyzer.
|
|
||||||
|
|
||||||
-->
|
|
||||||
<analyzer type="multiterm">
|
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
|
||||||
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
|
||||||
</analyzer>
|
|
||||||
</fieldType>
|
|
||||||
|
|
||||||
<!-- since fields of this type are by default not stored or indexed,
|
<!-- since fields of this type are by default not stored or indexed,
|
||||||
any data added to them will be ignored outright. -->
|
any data added to them will be ignored outright. -->
|
||||||
|
@ -587,6 +552,7 @@
|
||||||
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
|
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
|
||||||
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
|
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
|
||||||
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
|
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true" />
|
||||||
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
|
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
|
||||||
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
|
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
|
||||||
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
|
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
|
||||||
|
|
Loading…
Reference in New Issue