SOLR-2438: allow an analysis chain to be created for multiterm query terms, or synthesize one if not defined explicitly

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206229 13f79535-47bb-0310-9956-ffa450edef68
Erick Erickson 2011-11-25 15:46:26 +00:00
parent 6870592252
commit 098371446a
12 changed files with 655 additions and 26 deletions

View File: QueryParserBase.java

@@ -290,7 +290,6 @@ public abstract class QueryParserBase {
this.lowercaseExpandedTerms = lowercaseExpandedTerms;
}
/**
* @see #setLowercaseExpandedTerms(boolean)
*/
@@ -778,14 +777,21 @@ public abstract class QueryParserBase {
return new FuzzyQuery(term,minimumSimilarity,prefixLength);
}
private BytesRef analyzeRangePart(String field, String part) {
// TODO: Should this be protected instead?
private BytesRef analyzeMultitermTerm(String field, String part) {
return analyzeMultitermTerm(field, part, analyzer);
}
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
TokenStream source;
if (analyzerIn == null) analyzerIn = analyzer;
try {
source = analyzer.tokenStream(field, new StringReader(part));
source = analyzerIn.tokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
throw new RuntimeException("Unable to initialize TokenStream to analyze range part: " + part, e);
throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
@@ -793,10 +799,10 @@ public abstract class QueryParserBase {
try {
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
termAtt.fillBytesRef();
if (source.incrementToken())
throw new IllegalArgumentException("analyzer returned too many terms for range part: " + part);
throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
} catch (IOException e) {
throw new RuntimeException("error analyzing range part: " + part, e);
}
@@ -805,7 +811,7 @@ public abstract class QueryParserBase {
source.end();
source.close();
} catch (IOException e) {
throw new RuntimeException("Unable to end & close TokenStream after analyzing range part: " + part, e);
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
}
return BytesRef.deepCopyOf(bytes);
@@ -827,13 +833,13 @@ public abstract class QueryParserBase {
if (part1 == null) {
start = null;
} else {
start = analyzeRangeTerms ? analyzeRangePart(field, part1) : new BytesRef(part1);
start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
}
if (part2 == null) {
end = null;
} else {
end = analyzeRangeTerms ? analyzeRangePart(field, part2) : new BytesRef(part2);
end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
}
final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);
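Read together, the hunks above impose a single-token contract: the multiterm analyzer must emit exactly one token per fragment, because a wildcard or range endpoint maps to exactly one indexed term. A minimal self-contained restatement of that contract (the wrapper class and method name are ours; every analysis call it makes appears verbatim in the patch):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

public final class MultitermContractSketch {
  // Mirrors analyzeMultitermTerm: analyze one fragment, demand exactly one token.
  static BytesRef analyzeOneToken(Analyzer a, String field, String part) throws IOException {
    TokenStream source = a.tokenStream(field, new StringReader(part));
    source.reset();
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    if (!source.incrementToken())
      throw new IllegalArgumentException("analyzer returned no terms for: " + part);
    termAtt.fillBytesRef();                // fill the attribute's shared BytesRef
    BytesRef bytes = termAtt.getBytesRef();
    if (source.incrementToken())
      throw new IllegalArgumentException("analyzer returned too many terms for: " + part);
    source.end();
    source.close();
    return BytesRef.deepCopyOf(bytes);     // copy: the stream reuses its bytes
  }
}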

View File: solr/CHANGES.txt

@@ -188,6 +188,11 @@ New Features
* SOLR-2134 Trie* fields should support sortMissingLast=true, and deprecate Sortable* Field Types
(Ryan McKinley, Mike McCandless, Uwe Schindler, Erick Erickson)
* SOLR-2438: Case-insensitive search for wildcard queries; more generally, the ability to specify
a complete analysis chain for multiterm queries.
(Pete Sturge, Erick Erickson; mentoring from Seeley and Muir)
Optimizations
----------------------
@@ -383,6 +388,11 @@ New Features
* SOLR-1565: StreamingUpdateSolrServer supports RequestWriter API and therefore, javabin update
format (shalin)
* SOLR-2438: Case-insensitive search for wildcard queries; more generally, the ability to specify
a complete analysis chain for multiterm queries.
(Pete Sturge, Erick Erickson; mentoring from Seeley and Muir)
Bug Fixes
----------------------
* SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)

View File: FieldProperties.java

@@ -48,13 +48,15 @@ public abstract class FieldProperties {
protected final static int REQUIRED = 0x00001000;
protected final static int OMIT_POSITIONS = 0x00002000;
protected final static int LEGACY_MULTITERM = 0x00004000;
static final String[] propertyNames = {
"indexed", "tokenized", "stored",
"binary", "omitNorms", "omitTermFreqAndPositions",
"termVectors", "termPositions", "termOffsets",
"multiValued",
"sortMissingFirst","sortMissingLast","required", "omitPositions"
"sortMissingFirst","sortMissingLast","required", "omitPositions" ,
"legacyMultiTerm"
};
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();

View File: FieldType.java

@@ -428,6 +428,21 @@ public abstract class FieldType extends FieldProperties {
*/
protected Analyzer queryAnalyzer=analyzer;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type; subclasses can set the analyzer themselves or override
* getMultiTermAnalyzer().
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer + filters that "make sense" for such queries, primarily
* accent-folding and lowercasing filters, plus any charfilters.
*
* If users require the old behavior, they can specify legacyMultiTerm="true" in the schema file.
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
/**
* Returns the Analyzer to be used when indexing fields of this type.
* <p>
@@ -450,6 +465,17 @@ public abstract class FieldType extends FieldProperties {
return queryAnalyzer;
}
/**
* Returns the Analyzer to be used when searching fields of this type when multi-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
private final String analyzerError =
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer";
@@ -498,6 +524,28 @@ public abstract class FieldType extends FieldProperties {
throw e;
}
/**
* Sets the Analyzer to be used for multi-term queries against fields of this type.
*
* <p>
* Subclasses that override this method need to ensure the behavior
* of the analyzer is consistent with the implementation of toInternal.
* </p>
*
* @see #toInternal
* @see #setAnalyzer
* @see #getMultiTermAnalyzer
*/
public void setMultiTermAnalyzer(Analyzer analyzer) {
SolrException e = new SolrException
(ErrorCode.SERVER_ERROR,
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer");
SolrException.logOnce(log,null,e);
throw e;
}
/** @lucene.internal */
protected Similarity similarity;

View File: FieldTypePluginLoader.java

@@ -18,19 +18,15 @@
package org.apache.solr.schema;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.*;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DOMUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.util.plugin.AbstractPluginLoader;
import org.w3c.dom.*;
@@ -88,12 +84,16 @@ public final class FieldTypePluginLoader
String expression = "./analyzer[@type='query']";
Node anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
Analyzer queryAnalyzer = readAnalyzer(anode);
expression = "./analyzer[@type='multiterm']";
anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
Analyzer multiAnalyzer = readAnalyzer(anode);
// An analyzer without a type specified, or with type="index"
expression = "./analyzer[not(@type)] | ./analyzer[@type='index']";
anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
Analyzer analyzer = readAnalyzer(anode);
// a custom similarity[Factory]
expression = "./similarity";
anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
@@ -101,9 +101,16 @@ public final class FieldTypePluginLoader
if (queryAnalyzer==null) queryAnalyzer=analyzer;
if (analyzer==null) analyzer=queryAnalyzer;
if (multiAnalyzer == null) {
Boolean legacyMatch = ! schema.getDefaultLuceneMatchVersion().onOrAfter(Version.LUCENE_36);
legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch :
Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
}
if (analyzer!=null) {
ft.setAnalyzer(analyzer);
ft.setQueryAnalyzer(queryAnalyzer);
ft.setMultiTermAnalyzer(multiAnalyzer);
}
if (similarity!=null) {
ft.setSimilarity(similarity);
@@ -130,6 +137,42 @@ public final class FieldTypePluginLoader
return fieldTypes.put( name, plugin );
}
// The point here is that, if no multiterm analyzer was specified in the schema file, do one of two things:
// 1> If legacyMultiTerm == false, assemble a new analyzer composed of all of the charfilters plus the
//    lowercasing and ASCII-folding filters found in the query analyzer.
// 2> If legacyMultiTerm == true (i.e. the old behavior is explicitly required), just construct the
//    analyzer from a KeywordTokenizer. That should mimic the old behavior.
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) {
if (queryAnalyzer == null) return null;
if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) {
return new KeywordAnalyzer();
}
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
// we know it'll never be longer than this unless the code below is explicitly changed
TokenFilterFactory[] filters = new TokenFilterFactory[2];
int idx = 0;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
if (factory instanceof LowerCaseFilterFactory) {
filters[idx] = new LowerCaseFilterFactory();
filters[idx++].init(factory.getArgs());
}
if (factory instanceof ASCIIFoldingFilterFactory) {
filters[idx] = new ASCIIFoldingFilterFactory();
filters[idx++].init(factory.getArgs());
}
}
WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
white.init(tc.getTokenizerFactory().getArgs());
return new TokenizerChain(tc.getCharFilterFactories(),
white,
Arrays.copyOfRange(filters, 0, idx));
}
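As a rough standalone illustration of the chain this method synthesizes (a sketch, not the loader code: the real method copies the factories and their args from the query analyzer, and a live core also supplies luceneMatchVersion to each factory):

import java.util.Collections;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.solr.analysis.*;

final class SynthesizedChainSketch {
  static Analyzer build() {
    Map<String, String> noArgs = Collections.emptyMap(); // real code reuses factory.getArgs()
    WhitespaceTokenizerFactory tokenizer = new WhitespaceTokenizerFactory();
    tokenizer.init(noArgs);
    LowerCaseFilterFactory lower = new LowerCaseFilterFactory();
    lower.init(noArgs);
    ASCIIFoldingFilterFactory fold = new ASCIIFoldingFilterFactory();
    fold.init(noArgs);
    // Any charfilters from the query analyzer would be carried over here.
    return new TokenizerChain(new CharFilterFactory[0], tokenizer,
        new TokenFilterFactory[] { lower, fold });
  }
}

With such a chain, a fragment like "ÁB*" analyzes to the single token "ab*", the same form that index-time lowercasing and folding produced.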
//
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
//

View File: SchemaField.java

@@ -97,6 +97,9 @@ public final class SchemaField extends FieldProperties {
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
boolean isBinary() { return (properties & BINARY)!=0; }
boolean legacyMultiTerm() {
return (properties & LEGACY_MULTITERM) != 0;
}
public IndexableField createField(Object val, float boost) {
return type.createField(this,val,boost);

View File: TextField.java

@@ -98,6 +98,11 @@ public class TextField extends FieldType {
this.queryAnalyzer = analyzer;
}
@Override
public void setMultiTermAnalyzer(Analyzer analyzer) {
this.multiTermAnalyzer = analyzer;
}
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
int phraseSlop = 0;
boolean enablePositionIncrements = true;

View File: SolrQueryParser.java

@@ -26,7 +26,6 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
@@ -71,7 +70,6 @@ public class SolrQueryParser extends QueryParser {
this.schema = parser.getReq().getSchema();
this.parser = parser;
this.defaultField = defaultField;
setLowercaseExpandedTerms(false);
setEnablePositionIncrements(true);
checkAllowLeadingWildcards();
}
@@ -106,6 +104,14 @@ public class SolrQueryParser extends QueryParser {
}
}
protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) {
if (part == null) return part;
SchemaField sf = schema.getFieldOrNull(field);
if (sf == null || ! (sf.getType() instanceof TextField)) return part;
return analyzeMultitermTerm(field, part, analyzer).utf8ToString();
}
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
checkNullField(field);
@@ -137,6 +143,8 @@
@Override
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
checkNullField(field);
part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
SchemaField sf = schema.getField(field);
return sf.getType().getRangeQuery(parser, sf, part1, part2, startInclusive, endInclusive);
}
@@ -144,9 +152,8 @@
@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
checkNullField(field);
if (getLowercaseExpandedTerms()) {
termStr = termStr.toLowerCase();
}
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
// TODO: toInternal() won't necessarily work on partial
// values, so it looks like we need a getPrefix() function
@@ -162,14 +169,13 @@
PrefixQuery prefixQuery = new PrefixQuery(t);
return prefixQuery;
}
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
// *:* -> MatchAllDocsQuery
if ("*".equals(field) && "*".equals(termStr)) {
return newMatchAllDocsQuery();
}
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
// can we use reversed wildcards in this field?
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
@@ -213,4 +219,11 @@
}
return q;
}
@Override
protected Query getRegexpQuery(String field, String termStr) throws ParseException {
termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
return super.getRegexpQuery(field, termStr);
}
}
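The observable effect, sketched in the test idiom this commit adds below (the thorough coverage is in TestFoldingMultitermQuery): an uppercased, accented wildcard is folded by the multiterm chain before term expansion, so it matches the folded indexed token.

// Assuming a core loaded with the schema-folding.xml fixture below,
// where doc 1 indexes "hijklmn1" in the "content" field:
assertQ(req("q", "content:HÏ*"),   // the multiterm chain folds HÏ* to hi*
    "//result[@numFound='1']",
    "//*[@name='id'][.='1']");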

View File: schema-folding.xml (new test schema)

@@ -0,0 +1,145 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="test" version="1.0">
<types>
<fieldtype name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
<fieldType name="text" class="solr.TextField" multiValued="false">
<analyzer>
<tokenizer class="solr.PatternTokenizerFactory" pattern="\s+"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_multi" class="solr.TextField" multiValued="true">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
</analyzer>
<analyzer type="multiterm"> <!-- Intentionally different to test that these are kept distinct -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_multi_bad" class="solr.TextField" multiValued="false">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
</analyzer>
<analyzer type="multiterm"> <!-- Intentionally different to test that these are kept distinct -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ws" class="solr.TextField" multiValued="true">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="false"
maxPosAsterisk="1" maxPosQuestion="2" maxFractionAsterisk="0.99"
minTrailing="1"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_lower_tokenizer" class="solr.TextField">
<analyzer>
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldtype name="date" class="solr.TrieDateField" precisionStep="0"/>
</types>
<fields>
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="int_f" type="int"/>
<field name="float_f" type="float"/>
<field name="long_f" type="long"/>
<field name="double_f" type="double"/>
<field name="byte_f" type="byte"/>
<field name="short_f" type="short"/>
<field name="bool_f" type="boolean"/>
<field name="date_f" type="date"/>
<field name="content" type="text" indexed="true" stored="true"/>
<field name="content_ws" type="text_ws" indexed="true" stored="true"/>
<field name="content_rev" type="text_rev" indexed="true" stored="true"/>
<field name="content_multi" type="text_multi" indexed="true" stored="true"/>
<field name="content_lower_token" type="text_multi" indexed="true" stored="true"/>
<field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
<field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
<field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
</fields>
<defaultSearchField>content</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File: MultiTermTest.java

@@ -0,0 +1,87 @@
package org.apache.solr.schema;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.analysis.*;
import org.junit.BeforeClass;
import org.junit.Test;
public class MultiTermTest extends SolrTestCaseJ4 {
public String getCoreName() {
return "basic";
}
@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig-basic.xml", "schema-folding.xml");
}
@Test
public void testMultiFound() {
SchemaField field = h.getCore().getSchema().getField("content_multi");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
}
analyzer = field.getType().getAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof TrimFilterFactory));
}
assertTrue(tc.getCharFilterFactories().length == 0);
}
@Test
public void testQueryCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_charfilter");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue(factory instanceof LowerCaseFilterFactory);
}
assertTrue(tc.getCharFilterFactories().length == 1);
assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
}
@Test
public void testDefaultCopiedToMulti() {
SchemaField field = h.getCore().getSchema().getField("content_ws");
Analyzer analyzer = field.getType().getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
}
assertTrue(tc.getCharFilterFactories().length == 0);
}
}

View File: TestFoldingMultitermQuery.java

@@ -0,0 +1,231 @@
package org.apache.solr.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
public String getCoreName() {
return "basic";
}
@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig-basic.xml", "schema-folding.xml");
String docs[] = {
"abcdefg1 finger",
"gangs hijklmn1",
"opqrstu1 zilly",
};
// prepare the index
for (int i = 0; i < docs.length; i++) {
String num = Integer.toString(i);
String boolVal = ((i % 2) == 0) ? "true" : "false";
assertU(adoc("id", num,
"int_f", num,
"float_f", num,
"long_f", num,
"double_f", num,
"byte_f", num,
"short_f", num,
"bool_f", boolVal,
"date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
"content", docs[i],
"content_ws", docs[i],
"content_rev", docs[i],
"content_multi", docs[i],
"content_lower_token", docs[i],
"content_oldstyle", docs[i],
"content_charfilter", docs[i],
"content_multi_bad", docs[i]
));
}
assertU(optimize());
}
@Test
public void testPrefixCaseAccentFolding() throws Exception {
String matchOneDocPrefixUpper[][] = {
{"A*", "ÁB*", "ABÇ*"}, // these should find only doc 0
{"H*", "HÏ*", "HìJ*"}, // these should find only doc 1
{"O*", "ÖP*", "OPQ*"}, // these should find only doc 2
};
String matchRevPrefixUpper[][] = {
{"*Ğ1", "*DEfG1", "*EfG1"},
{"*N1", "*LmŊ1", "*MÑ1"},
{"*Ǖ1", "*sTu1", "*RŠTU1"}
};
// test the prefix queries find only one doc where the query is uppercased. Must go through query parser here!
for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
String me = matchOneDocPrefixUpper[idx][jdx];
assertQ(req("q", "content:" + me),
"//*[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_ws:" + me),
"//*[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_multi:" + me),
"//*[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_lower_token:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
}
}
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
String me = matchRevPrefixUpper[idx][jdx];
assertQ(req("q", "content_rev:" + me),
"//*[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
}
}
}
// test the wildcard queries find only one doc where the query is uppercased and/or accented.
@Test
public void testWildcardCaseAccentFolding() throws Exception {
String matchOneDocWildUpper[][] = {
{"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"}, // these should find only doc 0
{"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"}, // these should find only doc 1
{"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"}, // these should find only doc 2
};
for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
String me = matchOneDocWildUpper[idx][jdx];
assertQ("Error with " + me, req("q", "content:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_ws:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_multi:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_lower_token:" + me),
"//result[@numFound='1']",
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
}
}
}
// Phrases should fail. This test is mainly a marker so that, if phrases ever do start working
// with wildcards, we go update the documentation.
@Test
public void testPhrase() {
assertQ(req("q", "content:\"silly ABCD*\""),
"//result[@numFound='0']");
}
// Make sure the legacy behavior flag is honored
@Test
public void testLegacyBehavior() {
assertQ(req("q", "content_oldstyle:ABCD*"),
"//result[@numFound='0']");
}
@Test
public void testWildcardRange() {
assertQ(req("q", "content:[* TO *]"),
"//result[@numFound='3']");
}
// Does the char filter get correctly handled?
@Test
public void testCharFilter() {
assertQ(req("q", "content_charfilter:" + "Á*C*"),
"//result[@numFound='1']",
"//*[@name='id'][.='0']");
assertQ(req("q", "content_charfilter:" + "ABÇ*g1"),
"//result[@numFound='1']",
"//*[@name='id'][.='0']");
assertQ(req("q", "content_charfilter:" + "HÏ*l?*"),
"//result[@numFound='1']",
"//*[@name='id'][.='1']");
}
@Test
public void testRangeQuery() {
assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
"//result[@numFound='1']",
"//*[@name='id'][.='2']");
assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"),
"//result[@numFound='1']",
"//*[@name='id'][.='0']");
}
@Test
public void testNonTextTypes() {
String[] intTypes = {"int_f", "float_f", "long_f", "double_f", "byte_f", "short_f"};
for (String str : intTypes) {
assertQ(req("q", str + ":" + "0"),
"//result[@numFound='1']",
"//*[@name='id'][.='0']");
assertQ(req("q", str + ":" + "[0 TO 2]"),
"//result[@numFound='3']",
"//*[@name='id'][.='0']",
"//*[@name='id'][.='1']",
"//*[@name='id'][.='2']");
}
assertQ(req("q", "bool_f:true"),
"//result[@numFound='2']",
"//*[@name='id'][.='0']",
"//*[@name='id'][.='2']");
assertQ(req("q", "bool_f:[false TO true]"),
"//result[@numFound='3']",
"//*[@name='id'][.='0']",
"//*[@name='id'][.='1']",
"//*[@name='id'][.='2']");
assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
"//result[@numFound='1']",
"//*[@name='id'][.='0']");
assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
"//result[@numFound='2']",
"//*[@name='id'][.='1']",
"//*[@name='id'][.='2']");
}
@Test
public void testMultiBad() {
try {
assertQ(req("q", "content_multi_bad:" + "abCD*"));
fail("Should throw exception when token evaluates to more than one term");
} catch (Exception expected) {
assertTrue(expected.getCause() instanceof IllegalArgumentException);
}
}
}

View File: schema.xml (example config)

@@ -427,6 +427,42 @@
</analyzer>
</fieldType>
<!-- Illustrates the new "multiterm" analyzer definition. The <fieldType> can take a new
parameter legacyMultiTerm="true" if the old behavior is desired. The new default
behavior as of 3.6+ is to automatically define a multiterm analyzer
-->
<fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Illustrates the use of a new analyzer type, "multiterm". See the Wiki page "Multiterm
Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
applied to multiterm queries (prefix, wildcard, range) if specified. This allows, among other
things, not having to lowercase wildcard terms on the client.
In the absence of this section, the new default behavior (3.6, 4.0) is to construct
one of these from the query analyzer, incorporating any defined charfilters, a
WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
(if defined).
Arguably, this is an expert-level analyzer; most cases will be handled by an instance
being automatically constructed from the query analyzer.
-->
<analyzer type="multiterm">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />