From c94c1c5a647b0462754d80a11e47a07df28568f1 Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Sun, 27 Nov 2011 17:04:38 +0000 Subject: [PATCH] git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206767 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 5 + .../analysis/ASCIIFoldingFilterFactory.java | 7 +- .../solr/analysis/LowerCaseFilterFactory.java | 7 +- .../analysis/LowerCaseTokenizerFactory.java | 10 +- .../analysis/MappingCharFilterFactory.java | 7 +- .../analysis/MultiTermAwareComponent.java | 31 ++++++ .../analysis/PersianCharFilterFactory.java | 7 +- .../solr/analysis/TokenFilterFactory.java | 1 + .../apache/solr/schema/FieldProperties.java | 6 +- .../org/apache/solr/schema/FieldType.java | 51 ---------- .../solr/schema/FieldTypePluginLoader.java | 83 +++++++++++----- .../org/apache/solr/schema/SchemaField.java | 4 - .../org/apache/solr/schema/TextField.java | 81 ++++++++++++++-- .../apache/solr/search/SolrQueryParser.java | 94 +++++++++---------- .../test-files/solr/conf/schema-folding.xml | 66 +++++++++++-- .../org/apache/solr/schema/MultiTermTest.java | 12 +-- .../search/TestFoldingMultitermQuery.java | 68 +++++++++++--- solr/example/solr/conf/schema.xml | 36 +------ 18 files changed, 366 insertions(+), 210 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/analysis/MultiTermAwareComponent.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 65c0a4ad4bf..47ded3cbc2f 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -192,6 +192,11 @@ New Features * SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify a complete analysis chain for multiterm queries. 
(Pete Sturge Erick Erickson, Mentoring from Seeley and Muir) + +* SOLR-2918 Improvement to SOLR-2438, added MultiTermAwareComponent to the various classes + that should transform multiterm queries in various ways, and use this as the criteria for + adding them to the multiterm analyzer that is constructed if not specified in the <fieldType> + Optimizations diff --git a/solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java index 7f336cb3e54..88d57296b43 100644 --- a/solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java @@ -32,9 +32,14 @@ import org.apache.lucene.analysis.TokenStream; * </fieldType> * */ -public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory { +public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { public ASCIIFoldingFilter create(TokenStream input) { return new ASCIIFoldingFilter(input); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java index 422ba68ad9d..f7da154845c 100644 --- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java @@ -33,7 +33,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; * </fieldType> * */ -public class LowerCaseFilterFactory extends BaseTokenFilterFactory { +public class LowerCaseFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent { @Override public void init(Map args) { super.init(args); @@ -43,4 +43,9 @@ public class LowerCaseFilterFactory extends BaseTokenFilterFactory { public LowerCaseFilter create(TokenStream input) { return new 
LowerCaseFilter(luceneMatchVersion,input); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java index e2980e75b70..535a41dce4b 100644 --- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java @@ -17,6 +17,7 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseTokenizer; import java.io.Reader; @@ -32,7 +33,7 @@ import java.util.Map; * </fieldType> * */ -public class LowerCaseTokenizerFactory extends BaseTokenizerFactory { +public class LowerCaseTokenizerFactory extends BaseTokenizerFactory implements MultiTermAwareComponent { @Override public void init(Map args) { super.init(args); @@ -42,4 +43,11 @@ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory { public LowerCaseTokenizer create(Reader input) { return new LowerCaseTokenizer(luceneMatchVersion,input); } + + @Override + public Object getMultiTermComponent() { + LowerCaseFilterFactory filt = new LowerCaseFilterFactory(); + filt.init(args); + return filt; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java index aaba0430f49..2867cf57aaf 100644 --- a/solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java @@ -46,7 +46,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; * */ public class MappingCharFilterFactory extends BaseCharFilterFactory implements - ResourceLoaderAware { + ResourceLoaderAware, MultiTermAwareComponent { protected NormalizeCharMap normMap; private String mapping; @@ -126,4 +126,9 @@ 
public class MappingCharFilterFactory extends BaseCharFilterFactory implements } return new String( out, 0, writePos ); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/MultiTermAwareComponent.java b/solr/core/src/java/org/apache/solr/analysis/MultiTermAwareComponent.java new file mode 100644 index 00000000000..68aaf7e213f --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/MultiTermAwareComponent.java @@ -0,0 +1,31 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Add to any analysis factory component to allow returning an + * analysis component factory for use with partial terms in prefix queries, + * wildcard queries, range query endpoints, regex queries, etc. + * + * @lucene.experimental + */ +public interface MultiTermAwareComponent { + /** Returns an analysis component to handle analysis of multi-term queries. + * The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory. 
+ */ + public Object getMultiTermComponent(); +} diff --git a/solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java index f860c1de58e..226903e794d 100644 --- a/solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java @@ -31,10 +31,15 @@ import org.apache.lucene.analysis.fa.PersianCharFilter; * </fieldType> * */ -public class PersianCharFilterFactory extends BaseCharFilterFactory { +public class PersianCharFilterFactory extends BaseCharFilterFactory implements MultiTermAwareComponent { @Override public CharStream create(CharStream input) { return new PersianCharFilter(input); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/TokenFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/TokenFilterFactory.java index 1ae90e88c5b..c4cf0dca9f6 100644 --- a/solr/core/src/java/org/apache/solr/analysis/TokenFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/TokenFilterFactory.java @@ -67,3 +67,4 @@ public interface TokenFilterFactory { /** Transform the specified input TokenStream */ public TokenStream create(TokenStream input); } + diff --git a/solr/core/src/java/org/apache/solr/schema/FieldProperties.java b/solr/core/src/java/org/apache/solr/schema/FieldProperties.java index 370b001ba68..91e26f44891 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldProperties.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldProperties.java @@ -48,15 +48,13 @@ public abstract class FieldProperties { protected final static int REQUIRED = 0x00001000; protected final static int OMIT_POSITIONS = 0x00002000; - protected final static int LEGACY_MULTITERM = 0x00004000; - + static final String[] propertyNames = { "indexed", "tokenized", "stored", "binary", "omitNorms", 
"omitTermFreqAndPositions", "termVectors", "termPositions", "termOffsets", "multiValued", - "sortMissingFirst","sortMissingLast","required", "omitPositions" , - "legacyMultiTerm" + "sortMissingFirst","sortMissingLast","required", "omitPositions" }; static final Map propertyMap = new HashMap(); diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 54a0206490c..4fca55aff99 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -428,21 +428,6 @@ public abstract class FieldType extends FieldProperties { */ protected Analyzer queryAnalyzer=analyzer; - /** - * Analyzer set by schema for text types to use when searching fields - * of this type, subclasses can set analyzer themselves or override - * getAnalyzer() - * This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It - * assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and - * lowercasing filters, and charfilters. - * - * If users require old-style behavior, they can specify 'legacyMultiterm="true" ' in the schema file - * @see #getMultiTermAnalyzer - * @see #setMultiTermAnalyzer - */ - protected Analyzer multiTermAnalyzer=null; - - /** * Returns the Analyzer to be used when indexing fields of this type. *

@@ -465,20 +450,6 @@ public abstract class FieldType extends FieldProperties { return queryAnalyzer; } - /** - * Returns the Analyzer to be used when searching fields of this type when mult-term queries are specified. - *

- * This method may be called many times, at any time. - *

- * @see #getAnalyzer - */ - public Analyzer getMultiTermAnalyzer() { - return multiTermAnalyzer; - } - - private final String analyzerError = - "FieldType: " + this.getClass().getSimpleName() + - " (" + typeName + ") does not support specifying an analyzer"; /** * Sets the Analyzer to be used when indexing fields of this type. @@ -524,28 +495,6 @@ public abstract class FieldType extends FieldProperties { throw e; } - /** - * Sets the Analyzer to be used when querying fields of this type. - * - *

- * - * Subclasses that override this method need to ensure the behavior - * of the analyzer is consistent with the implementation of toInternal. - *

- * - * @see #toInternal - * @see #setAnalyzer - * @see #getQueryAnalyzer - */ - public void setMultiTermAnalyzer(Analyzer analyzer) { - SolrException e = new SolrException - (ErrorCode.SERVER_ERROR, - "FieldType: " + this.getClass().getSimpleName() + - " (" + typeName + ") does not support specifying an analyzer"); - SolrException.logOnce(log,null,e); - throw e; - } - /** @lucene.internal */ protected Similarity similarity; diff --git a/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java b/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java index 090c7b0ada2..37277e87cd3 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java @@ -102,15 +102,13 @@ public final class FieldTypePluginLoader if (queryAnalyzer==null) queryAnalyzer=analyzer; if (analyzer==null) analyzer=queryAnalyzer; if (multiAnalyzer == null) { - Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36); - legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch : - Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null)); - multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch); + multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer); } if (analyzer!=null) { ft.setAnalyzer(analyzer); ft.setQueryAnalyzer(queryAnalyzer); - ft.setMultiTermAnalyzer(multiAnalyzer); + if (ft instanceof TextField) + ((TextField)ft).setMultiTermAnalyzer(multiAnalyzer); } if (similarity!=null) { ft.setSimilarity(similarity); @@ -143,36 +141,75 @@ public final class FieldTypePluginLoader // 2> If letacyMultiTerm == true just construct the analyzer from a KeywordTokenizer. That should mimic current behavior. 
// Do the same if they've specified that the old behavior is required (legacyMultiTerm="true") - private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) { + private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) { if (queryAnalyzer == null) return null; - if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) { + if (!(queryAnalyzer instanceof TokenizerChain)) { return new KeywordAnalyzer(); } TokenizerChain tc = (TokenizerChain) queryAnalyzer; + MultiTermChainBuilder builder = new MultiTermChainBuilder(); - // we know it'll never be longer than this unless the code below is explicitly changed - TokenFilterFactory[] filters = new TokenFilterFactory[2]; - int idx = 0; - for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { - if (factory instanceof LowerCaseFilterFactory) { - filters[idx] = new LowerCaseFilterFactory(); - filters[idx++].init(factory.getArgs()); - } - if (factory instanceof ASCIIFoldingFilterFactory) { - filters[idx] = new ASCIIFoldingFilterFactory(); - filters[idx++].init(factory.getArgs()); + CharFilterFactory[] charFactories = tc.getCharFilterFactories(); + if (charFactories != null) { + for (CharFilterFactory fact : charFactories) { + builder.add(fact); } } - WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory(); - white.init(tc.getTokenizerFactory().getArgs()); - return new TokenizerChain(tc.getCharFilterFactories(), - white, - Arrays.copyOfRange(filters, 0, idx)); + builder.add(tc.getTokenizerFactory()); + + for (TokenFilterFactory fact : tc.getTokenFilterFactories()) { + builder.add(fact); + } + + return builder.build(); } + private static class MultiTermChainBuilder { + static final KeywordTokenizerFactory keyFactory; + + static { + keyFactory = new KeywordTokenizerFactory(); + keyFactory.init(new HashMap()); + } + + ArrayList charFilters = null; + ArrayList filters = new ArrayList(2); + TokenizerFactory tokenizer = keyFactory; + + public void add(Object 
current) { + if (!(current instanceof MultiTermAwareComponent)) return; + Object newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent(); + if (newComponent instanceof TokenFilterFactory) { + if (filters == null) { + filters = new ArrayList(2); + } + filters.add((TokenFilterFactory)newComponent); + } else if (newComponent instanceof TokenizerFactory) { + tokenizer = (TokenizerFactory)newComponent; + } else if (newComponent instanceof CharFilterFactory) { + if (charFilters == null) { + charFilters = new ArrayList(1); + } + charFilters.add( (CharFilterFactory)newComponent); + + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent); + } + } + + public TokenizerChain build() { + CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]); + TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]); + return new TokenizerChain(charFilterArr, tokenizer, filterArr); + } + + + } + + // // // diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java index 34990ff9655..aaf5c06f30e 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java @@ -97,10 +97,6 @@ public final class SchemaField extends FieldProperties { boolean isTokenized() { return (properties & TOKENIZED)!=0; } boolean isBinary() { return (properties & BINARY)!=0; } - boolean legacyMultiTerm() { - return (properties & LEGACY_MULTITERM) != 0; - } - public IndexableField createField(Object val, float boost) { return type.createField(this,val,boost); } diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java index c3f76324d0e..8af29f06af8 
100644 --- a/solr/core/src/java/org/apache/solr/schema/TextField.java +++ b/solr/core/src/java/org/apache/solr/schema/TextField.java @@ -17,13 +17,8 @@ package org.apache.solr.schema; -import org.apache.lucene.search.SortField; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.search.*; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -32,6 +27,7 @@ import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrException; import org.apache.solr.response.TextResponseWriter; import org.apache.solr.search.QParser; @@ -48,6 +44,19 @@ import java.io.StringReader; public class TextField extends FieldType { protected boolean autoGeneratePhraseQueries; + /** + * Analyzer set by schema for text types to use when searching fields + * of this type, subclasses can set analyzer themselves or override + * getAnalyzer() + * This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It + * assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and + * lowercasing filters, and charfilters. 
+ * + * @see #getMultiTermAnalyzer + * @see #setMultiTermAnalyzer + */ + protected Analyzer multiTermAnalyzer=null; + @Override protected void init(IndexSchema schema, Map args) { properties |= TOKENIZED; @@ -63,6 +72,21 @@ public class TextField extends FieldType { super.init(schema, args); } + /** + * Returns the Analyzer to be used when searching fields of this type when multi-term queries are specified. + *

+ * This method may be called many times, at any time. + *

+ * @see #getAnalyzer + */ + public Analyzer getMultiTermAnalyzer() { + return multiTermAnalyzer; + } + + public void setMultiTermAnalyzer(Analyzer analyzer) { + this.multiTermAnalyzer = analyzer; + } + public boolean getAutoGeneratePhraseQueries() { return autoGeneratePhraseQueries; } @@ -98,11 +122,50 @@ public class TextField extends FieldType { this.queryAnalyzer = analyzer; } + @Override - public void setMultiTermAnalyzer(Analyzer analyzer) { - this.multiTermAnalyzer = analyzer; + public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) { + Analyzer multiAnalyzer = getMultiTermAnalyzer(); + BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer); + BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer); + return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive); } + public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) { + if (part == null) return null; + + TokenStream source; + try { + source = analyzerIn.tokenStream(field, new StringReader(part)); + source.reset(); + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e); + } + + TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); + BytesRef bytes = termAtt.getBytesRef(); + + try { + if (!source.incrementToken()) + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part); + termAtt.fillBytesRef(); + if (source.incrementToken()) + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part); + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e); + } + + try { + source.end(); + source.close(); + } 
catch (IOException e) { + throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e); + } + + return BytesRef.deepCopyOf(bytes); + } + + static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) { int phraseSlop = 0; boolean enablePositionIncrements = true; diff --git a/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java b/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java index 0c5584d9c3a..441a26a9137 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java +++ b/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java @@ -58,8 +58,9 @@ public class SolrQueryParser extends QueryParser { protected final IndexSchema schema; protected final QParser parser; protected final String defaultField; - protected final Map leadingWildcards = - new HashMap(); + + // implementation detail - caching ReversedWildcardFilterFactory based on type + private Map leadingWildcards; public SolrQueryParser(QParser parser, String defaultField) { this(parser, defaultField, parser.getReq().getSchema().getQueryAnalyzer()); @@ -71,30 +72,34 @@ public class SolrQueryParser extends QueryParser { this.parser = parser; this.defaultField = defaultField; setEnablePositionIncrements(true); - checkAllowLeadingWildcards(); + setLowercaseExpandedTerms(false); + setAllowLeadingWildcard(true); } - protected void checkAllowLeadingWildcards() { - boolean allow = false; - for (Entry e : schema.getFieldTypes().entrySet()) { - Analyzer a = e.getValue().getAnalyzer(); - if (a instanceof TokenizerChain) { - // examine the indexing analysis chain if it supports leading wildcards - TokenizerChain tc = (TokenizerChain)a; - TokenFilterFactory[] factories = tc.getTokenFilterFactories(); - for (TokenFilterFactory factory : factories) { - if (factory instanceof ReversedWildcardFilterFactory) { - allow = true; - leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory); - 
} + protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) { + if (leadingWildcards == null) leadingWildcards = new HashMap(); + ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType); + if (fac == null && leadingWildcards.containsKey(fac)) { + return fac; + } + + Analyzer a = fieldType.getAnalyzer(); + if (a instanceof TokenizerChain) { + // examine the indexing analysis chain if it supports leading wildcards + TokenizerChain tc = (TokenizerChain)a; + TokenFilterFactory[] factories = tc.getTokenFilterFactories(); + for (TokenFilterFactory factory : factories) { + if (factory instanceof ReversedWildcardFilterFactory) { + fac = (ReversedWildcardFilterFactory)factory; + break; } } } - // XXX should be enabled on a per-field basis - if (allow) { - setAllowLeadingWildcard(true); - } + + leadingWildcards.put(fieldType, fac); + return fac; } + private void checkNullField(String field) throws SolrException { if (field == null && defaultField == null) { @@ -104,12 +109,14 @@ public class SolrQueryParser extends QueryParser { } } - protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) { + protected String analyzeIfMultitermTermText(String field, String part, FieldType fieldType) { if (part == null) return part; SchemaField sf = schema.getFieldOrNull((field)); - if (sf == null || ! (sf.getType() instanceof TextField)) return part; - return analyzeMultitermTerm(field, part, analyzer).utf8ToString(); + if (sf == null || ! 
(fieldType instanceof TextField)) return part; + String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString(); + // System.out.println("INPUT="+part + " OUTPUT="+out); + return out; } @Override @@ -143,8 +150,6 @@ public class SolrQueryParser extends QueryParser { @Override protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException { checkNullField(field); - part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer()); - part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer()); SchemaField sf = schema.getField(field); return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive); } @@ -153,21 +158,10 @@ public class SolrQueryParser extends QueryParser { protected Query getPrefixQuery(String field, String termStr) throws ParseException { checkNullField(field); - termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); + termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field)); - // TODO: toInternal() won't necessarily work on partial - // values, so it looks like we need a getPrefix() function - // on fieldtype? Or at the minimum, a method on fieldType - // that can tell me if I should lowercase or not... - // Schema could tell if lowercase filter is in the chain, - // but a more sure way would be to run something through - // the first time and check if it got lowercased. - - // TODO: throw exception if field type doesn't support prefixes? - // (sortable numeric types don't do prefixes, but can do range queries) - Term t = new Term(field, termStr); - PrefixQuery prefixQuery = new PrefixQuery(t); - return prefixQuery; + // Solr has always used constant scoring for prefix queries. This should return constant scoring by default. 
+ return newPrefixQuery(new Term(field, termStr)); } @Override protected Query getWildcardQuery(String field, String termStr) throws ParseException { @@ -175,10 +169,10 @@ public class SolrQueryParser extends QueryParser { if ("*".equals(field) && "*".equals(termStr)) { return newMatchAllDocsQuery(); } - termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); + FieldType fieldType = schema.getFieldType(field); + termStr = analyzeIfMultitermTermText(field, termStr, fieldType); // can we use reversed wildcards in this field? - String type = schema.getFieldType(field).getTypeName(); - ReversedWildcardFilterFactory factory = leadingWildcards.get(type); + ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType); if (factory != null) { Term term = new Term(field, termStr); // fsa representing the query @@ -211,19 +205,15 @@ public class SolrQueryParser extends QueryParser { } }; } - Query q = super.getWildcardQuery(field, termStr); - if (q instanceof WildcardQuery) { - // use a constant score query to avoid overflowing clauses - WildcardQuery wildcardQuery = new WildcardQuery(((WildcardQuery)q).getTerm()); - return wildcardQuery; - } - return q; + + // Solr has always used constant scoring for wildcard queries. This should return constant scoring by default. 
+ return newWildcardQuery(new Term(field, termStr)); } - + @Override protected Query getRegexpQuery(String field, String termStr) throws ParseException { - termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer()); - return super.getRegexpQuery(field, termStr); + termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field)); + return newRegexpQuery(new Term(field, termStr)); } } diff --git a/solr/core/src/test-files/solr/conf/schema-folding.xml b/solr/core/src/test-files/solr/conf/schema-folding.xml index 798ca301217..0e77b8b59a6 100644 --- a/solr/core/src/test-files/solr/conf/schema-folding.xml +++ b/solr/core/src/test-files/solr/conf/schema-folding.xml @@ -64,7 +64,7 @@
- + @@ -80,12 +80,25 @@ - + + + + + + + + + + + + + + @@ -99,19 +112,47 @@ - + + + + + + + + + + + + + + + + + + + + + - + - - - - + + + + + + + + + + + + @@ -133,10 +174,17 @@ - + + + + + + + + content diff --git a/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java b/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java index 7e64aff3c54..792ee6df246 100644 --- a/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java +++ b/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java @@ -36,7 +36,7 @@ public class MultiTermTest extends SolrTestCaseJ4 { @Test public void testMultiFound() { SchemaField field = h.getCore().getSchema().getField("content_multi"); - Analyzer analyzer = field.getType().getMultiTermAnalyzer(); + Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer(); assertTrue(analyzer instanceof TokenizerChain); assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); TokenizerChain tc = (TokenizerChain) analyzer; @@ -58,9 +58,9 @@ public class MultiTermTest extends SolrTestCaseJ4 { @Test public void testQueryCopiedToMulti() { SchemaField field = h.getCore().getSchema().getField("content_charfilter"); - Analyzer analyzer = field.getType().getMultiTermAnalyzer(); + Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer(); assertTrue(analyzer instanceof TokenizerChain); - assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); + assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory); TokenizerChain tc = (TokenizerChain) analyzer; for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { assertTrue(factory instanceof LowerCaseFilterFactory); @@ -73,15 +73,15 @@ public class MultiTermTest extends SolrTestCaseJ4 { @Test public void testDefaultCopiedToMulti() { SchemaField field = h.getCore().getSchema().getField("content_ws"); - Analyzer analyzer = 
field.getType().getMultiTermAnalyzer(); + Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer(); assertTrue(analyzer instanceof TokenizerChain); - assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory); + assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory); TokenizerChain tc = (TokenizerChain) analyzer; for (TokenFilterFactory factory : tc.getTokenFilterFactories()) { assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory)); } - assertTrue(tc.getCharFilterFactories().length == 0); + assertTrue(tc.getCharFilterFactories() == null); } } diff --git a/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java b/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java index 2d02e922135..888f6047291 100644 --- a/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java +++ b/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java @@ -59,7 +59,12 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { "content_lower_token", docs[i], "content_oldstyle", docs[i], "content_charfilter", docs[i], - "content_multi_bad", docs[i] + "content_multi_bad", docs[i], + "content_straight", docs[i], + "content_lower", docs[i], + "content_folding", docs[i], + "content_stemming", docs[i], + "content_keyword", docs[i] )); } assertU(optimize()); @@ -95,6 +100,8 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { assertQ(req("q", "content_lower_token:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); + assertQ(req("q", "content_oldstyle:" + me), + "//result[@numFound='0']"); } } for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) { @@ -128,13 +135,50 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { assertQ(req("q", "content_multi:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + 
Integer.toString(idx) + "']"); - assertQ(req("q", "content_lower_token:" + me), - "//result[@numFound='1']", - "//*[@name='id'][.='" + Integer.toString(idx) + "']"); + assertQ(req("q", "content_oldstyle:" + me), + "//result[@numFound='0']"); } } } + @Test + public void testLowerTokenizer() { + // The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test. + assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']"); + assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']"); + assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']"); + assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']"); + assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']"); + assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']"); + assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']"); + } + + @Test + public void testRegex() throws Exception { + assertQ(req("q", "content:/Zill[a-z]/"), + "//result[@numFound='1']"); + assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased? + "//result[@numFound='1']"); + assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"), + "//result[@numFound='1']"); + + assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match + "//result[@numFound='0']"); + assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match + "//result[@numFound='0']"); + + assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces + "//result[@numFound='1']"); + + } + + + @Test + public void testGeneral() throws Exception { + assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing* + assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']"); + } + // Phrases should fail. 
This test is mainly a marker so if phrases ever do start working with wildcards we go // and update the documentation @Test @@ -143,17 +187,14 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { "//result[@numFound='0']"); } - // Make sure the legacy behavior flag is honored - @Test - public void testLegacyBehavior() { - assertQ(req("q", "content_oldstyle:ABCD*"), - "//result[@numFound='0']"); - } - @Test public void testWildcardRange() { assertQ(req("q", "content:[* TO *]"), "//result[@numFound='3']"); + assertQ(req("q", "content:[AB* TO Z*]"), + "//result[@numFound='3']"); + assertQ(req("q", "content:[AB*E?G* TO TU*W]"), + "//result[@numFound='3']"); } @@ -222,10 +263,13 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { @Test public void testMultiBad() { try { + ignoreException("analyzer returned too many terms"); assertQ(req("q", "content_multi_bad:" + "abCD*")); fail("Should throw exception when token evaluates to more than one term"); } catch (Exception expected) { - assertTrue(expected.getCause() instanceof IllegalArgumentException); + assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException); + } finally { + resetExceptionIgnores(); } } } \ No newline at end of file diff --git a/solr/example/solr/conf/schema.xml b/solr/example/solr/conf/schema.xml index 794c5e05b71..6a584d3da4e 100755 --- a/solr/example/solr/conf/schema.xml +++ b/solr/example/solr/conf/schema.xml @@ -427,41 +427,6 @@ - - - - - - - - - - - - - - - - - - - @@ -587,6 +552,7 @@ +