SOLR-1677: Add support for luceneMatchVersion in Analyzers, Tokenizers and TokenFilters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/branches/solr@923028 13f79535-47bb-0310-9956-ffa450edef68
Author: Uwe Schindler
Date: 2010-03-15 02:07:09 +00:00
parent 2e35d9a8c6
commit 5cc19567b9
13 changed files with 257 additions and 12 deletions


@@ -17,10 +17,13 @@
package org.apache.solr.analysis;
import org.apache.solr.core.Config;
import org.apache.solr.schema.IndexSchema;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.util.Version;
/**
@@ -34,8 +37,15 @@ public abstract class BaseTokenFilterFactory implements TokenFilterFactory {
/** The init args */
protected Map<String,String> args;
/** the luceneVersion arg */
protected Version luceneMatchVersion = null;
public void init(Map<String,String> args) {
this.args=args;
String matchVersion = args.get(IndexSchema.LUCENE_MATCH_VERSION_PARAM);
if (matchVersion != null) {
luceneMatchVersion = Config.parseLuceneVersionString(matchVersion);
}
}
public Map<String,String> getArgs() {


@@ -17,9 +17,13 @@
package org.apache.solr.analysis;
import org.apache.solr.core.Config;
import org.apache.solr.schema.IndexSchema;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.util.Version;
/**
@@ -33,8 +37,15 @@ public abstract class BaseTokenizerFactory implements TokenizerFactory {
/** The init args */
protected Map<String,String> args;
/** the luceneVersion arg */
protected Version luceneMatchVersion = null;
public void init(Map<String,String> args) {
this.args=args;
String matchVersion = args.get(IndexSchema.LUCENE_MATCH_VERSION_PARAM);
if (matchVersion != null) {
luceneMatchVersion = Config.parseLuceneVersionString(matchVersion);
}
}
public Map<String,String> getArgs() {

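Usage sketch (not part of this diff): once init() runs with a luceneMatchVersion entry in the args map, the parsed Version is available to the factory's create() method. The init-arg values below are illustrative assumptions.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.StandardTokenizerFactory;

Map<String,String> initArgs = new HashMap<String,String>();
initArgs.put("luceneMatchVersion", "LUCENE_24"); // same key as IndexSchema.LUCENE_MATCH_VERSION_PARAM
StandardTokenizerFactory factory = new StandardTokenizerFactory();
factory.init(initArgs); // parses the value into the protected luceneMatchVersion field
Tokenizer stream = factory.create(new StringReader("hello world")); // tokenizes with LUCENE_24 semantics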

@@ -19,7 +19,6 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
- import org.apache.lucene.util.Version;
import java.io.Reader;
@@ -29,6 +28,6 @@ import java.io.Reader;
public class StandardTokenizerFactory extends BaseTokenizerFactory {
public StandardTokenizer create(Reader input) {
- return new StandardTokenizer(Version.LUCENE_24, input);
+ return new StandardTokenizer(luceneMatchVersion, input);
}
}


@@ -51,8 +51,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
- //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
- stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
+ stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
}
} catch (IOException e) {
throw new RuntimeException(e);
@@ -61,7 +60,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
}
}
- //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
private CharArraySet stopWords;
private boolean ignoreCase;
private boolean enablePositionIncrements;
@@ -74,12 +73,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
return ignoreCase;
}
- public Set getStopWords() {
+ public Set<?> getStopWords() {
return stopWords;
}
public StopFilter create(TokenStream input) {
- StopFilter stopFilter = new StopFilter(enablePositionIncrements, input,stopWords,ignoreCase);
+ StopFilter stopFilter = new StopFilter(enablePositionIncrements,input,stopWords,ignoreCase);
return stopFilter;
}
}

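The removed TODO is settled by this change: StopFilter.makeStopSet accepts the List directly, so the toArray() copy is gone. A minimal sketch of the call (the word list is an illustrative stand-in for loader.getLines(file)):

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.StopFilter;

List<String> wlist = Arrays.asList("a", "an", "the"); // illustrative stop words
Set stopSet = StopFilter.makeStopSet(wlist, true); // ignoreCase = true, no array round-trip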

@@ -29,9 +29,15 @@ import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.namespace.QName;
import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.util.Version;
/**
* @version $Id$
@@ -267,6 +273,41 @@ public class Config {
return val!=null ? Double.parseDouble(val) : def;
}
public Version getLuceneVersion(String path) {
return parseLuceneVersionString(getVal(path, true));
}
public Version getLuceneVersion(String path, Version def) {
String val = getVal(path, false);
return val!=null ? parseLuceneVersionString(val) : def;
}
private static final AtomicBoolean versionWarningAlreadyLogged = new AtomicBoolean(false);
public static final Version parseLuceneVersionString(String matchVersion) {
matchVersion = matchVersion.toUpperCase();
final Version version;
try {
version = Version.valueOf(matchVersion);
} catch (IllegalArgumentException iae) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Invalid luceneMatchVersion '" + matchVersion +
"' property, valid values are: " + Arrays.toString(Version.values()), iae, false);
}
if (version == Version.LUCENE_CURRENT && !versionWarningAlreadyLogged.getAndSet(true)) {
log.warn(
"You should not use LUCENE_CURRENT as luceneMatchVersion property: "+
"if you use this setting, and then Solr upgrades to a newer release of Lucene, "+
"sizable changes may happen. If precise back compatibility is important "+
"then you should instead explicitly specify an actual Lucene version."
);
}
return version;
}
// The following functions were moved to ResourceLoader
//-----------------------------------------------------------------------------

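Sketch of the parser's expected behavior (inputs are examples; LUCENE_99 stands in for any constant Version does not define):

import org.apache.lucene.util.Version;
import org.apache.solr.core.Config;

Version v = Config.parseLuceneVersionString("LUCENE_24"); // Version.LUCENE_24
v = Config.parseLuceneVersionString("lucene_30"); // uppercased before Version.valueOf, so this parses too
// Config.parseLuceneVersionString("LUCENE_99") -- throws SolrException with ErrorCode.SERVER_ERROR
// Config.parseLuceneVersionString("LUCENE_CURRENT") -- parses, but logs the warning above once per JVM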

@@ -37,6 +37,7 @@ import org.apache.solr.spelling.QueryConverter;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.index.IndexDeletionPolicy;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -134,6 +135,8 @@ public class SolrConfig extends Config {
reopenReaders = getBool("mainIndex/reopenReaders", true);
booleanQueryMaxClauseCount = getInt("query/maxBooleanClauses", BooleanQuery.getMaxClauseCount());
luceneMatchVersion = getLuceneVersion("luceneMatchVersion", Version.LUCENE_24);
filtOptEnabled = getBool("query/boolTofilterOptimizer/@enabled", false);
filtOptCacheSize = getInt("query/boolTofilterOptimizer/@cacheSize",32);
filtOptThreshold = getFloat("query/boolTofilterOptimizer/@threshold",.05f);
@@ -261,6 +264,7 @@ public class SolrConfig extends Config {
public final int maxWarmingSearchers;
public final boolean unlockOnStartup;
public final boolean useColdSearcher;
public final Version luceneMatchVersion;
protected String dataDir;
//JMX configuration


@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
@@ -46,6 +47,7 @@ import java.io.InputStream;
import java.io.Reader;
import java.io.IOException;
import java.util.*;
import java.lang.reflect.Constructor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -57,6 +59,7 @@ import org.slf4j.LoggerFactory;
*/
public final class IndexSchema {
public static final String DEFAULT_SCHEMA_FILE = "schema.xml";
public static final String LUCENE_MATCH_VERSION_PARAM = "luceneMatchVersion";
final static Logger log = LoggerFactory.getLogger(IndexSchema.class);
private final SolrConfig solrConfig;
@@ -818,7 +821,24 @@ public final class IndexSchema {
NamedNodeMap attrs = node.getAttributes();
String analyzerName = DOMUtil.getAttr(attrs,"class");
if (analyzerName != null) {
- return (Analyzer)loader.newInstance(analyzerName);
+ // nocommit: add support for CoreAware & Co here?
+ final Class<? extends Analyzer> clazz = loader.findClass(analyzerName).asSubclass(Analyzer.class);
+ try {
+ try {
+ // first try to use a ctor with version parameter (needed for many new Analyzers that have no default one anymore)
+ Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
+ final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM);
+ final Version luceneMatchVersion = (matchVersionStr == null) ?
+ solrConfig.luceneMatchVersion : Config.parseLuceneVersionString(matchVersionStr);
+ return cnstr.newInstance(luceneMatchVersion);
+ } catch (NoSuchMethodException nsme) {
+ // otherwise use default ctor
+ return clazz.newInstance();
+ }
+ } catch (Exception e) {
+ throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,
+ "Cannot load analyzer: "+analyzerName );
+ }
}
XPath xpath = XPathFactory.newInstance().newXPath();
@@ -832,7 +852,11 @@
@Override
protected void init(CharFilterFactory plugin, Node node) throws Exception {
if( plugin != null ) {
- plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
+ final Map<String,String> params = DOMUtil.toMapExcept(node.getAttributes(),"class");
+ // copy the luceneMatchVersion from config, if not set
+ if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
+ params.put(LUCENE_MATCH_VERSION_PARAM, solrConfig.luceneMatchVersion.toString());
+ plugin.init( params );
charFilters.add( plugin );
}
}
@@ -858,7 +882,11 @@
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,
"The schema defines multiple tokenizers for: "+node );
}
- plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
+ final Map<String,String> params = DOMUtil.toMapExcept(node.getAttributes(),"class");
+ // copy the luceneMatchVersion from config, if not set
+ if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
+ params.put(LUCENE_MATCH_VERSION_PARAM, solrConfig.luceneMatchVersion.toString());
+ plugin.init( params );
tokenizers.add( plugin );
}
@@ -884,7 +912,11 @@
@Override
protected void init(TokenFilterFactory plugin, Node node) throws Exception {
if( plugin != null ) {
- plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
+ final Map<String,String> params = DOMUtil.toMapExcept(node.getAttributes(),"class");
+ // copy the luceneMatchVersion from config, if not set
+ if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
+ params.put(LUCENE_MATCH_VERSION_PARAM, solrConfig.luceneMatchVersion.toString());
+ plugin.init( params );
filters.add( plugin );
}
}

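The reflective constructor selection above can be sketched in isolation; StandardAnalyzer merely exemplifies an Analyzer with a Version constructor, and the hard-coded class name is an assumption for illustration (checked reflection exceptions omitted for brevity):

import java.lang.reflect.Constructor;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.Version;

Class<? extends Analyzer> clazz =
    Class.forName("org.apache.lucene.analysis.standard.StandardAnalyzer").asSubclass(Analyzer.class);
Analyzer analyzer;
try {
  // prefer the Version-taking ctor, as the schema loader now does
  Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
  analyzer = cnstr.newInstance(Version.LUCENE_30);
} catch (NoSuchMethodException nsme) {
  // analyzers without one still work via the default ctor
  analyzer = clazz.newInstance();
}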

@@ -80,7 +80,7 @@ public class SolrInfoMBeanTest extends TestCase
}
}
}
- assertTrue( "there are at least 10 SolrInfoMBean that should be found in the classpath.", checked > 10 );
+ assertTrue( "there are at least 10 SolrInfoMBean that should be found in the classpath, found " + checked, checked > 10 );
}
private static List<Class> getClassesForPackage(String pckgname) throws Exception {


@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.Map;
import java.util.Collections;
import junit.framework.TestCase;
@@ -34,6 +36,9 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
*/
public abstract class BaseTokenTestCase extends TestCase
{
protected static final Map<String,String> DEFAULT_VERSION_PARAM =
Collections.singletonMap("luceneMatchVersion", "LUCENE_30");
// some helpers to test Analyzers and TokenStreams:
// these are taken from Lucene's BaseTokenStreamTestCase


@@ -0,0 +1,82 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.StringReader;
import java.lang.reflect.Field;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
/**
* Tests for luceneMatchVersion property for analyzers
*/
public class TestLuceneMatchVersion extends AbstractSolrTestCase {
@Override
public String getSchemaFile() {
return "schema-luceneMatchVersion.xml";
}
@Override
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public static final Version DEFAULT_VERSION = Version.LUCENE_30;
public void testStandardTokenizerVersions() throws Exception {
assertEquals(DEFAULT_VERSION, solrConfig.luceneMatchVersion);
final IndexSchema schema = h.getCore().getSchema();
FieldType type = schema.getFieldType("textDefault");
TokenizerChain ana = (TokenizerChain) type.getAnalyzer();
assertEquals(DEFAULT_VERSION, ((BaseTokenizerFactory) ana.getTokenizerFactory()).luceneMatchVersion);
assertEquals(DEFAULT_VERSION, ((BaseTokenFilterFactory) ana.getTokenFilterFactories()[2]).luceneMatchVersion);
TokenizerChain.TokenStreamInfo tsi = ana.getStream("textDefault",new StringReader(""));
StandardTokenizer tok = (StandardTokenizer) tsi.getTokenizer();
assertTrue(tok.isReplaceInvalidAcronym());
type = schema.getFieldType("text20");
ana = (TokenizerChain) type.getAnalyzer();
assertEquals(Version.LUCENE_20, ((BaseTokenizerFactory) ana.getTokenizerFactory()).luceneMatchVersion);
assertEquals(Version.LUCENE_24, ((BaseTokenFilterFactory) ana.getTokenFilterFactories()[2]).luceneMatchVersion);
tsi = ana.getStream("text20",new StringReader(""));
tok = (StandardTokenizer) tsi.getTokenizer();
assertFalse(tok.isReplaceInvalidAcronym());
// this is a hack to get the private matchVersion field in StandardAnalyzer, may break in later lucene versions - we have no getter :(
final Field matchVersionField = StandardAnalyzer.class.getDeclaredField("matchVersion");
matchVersionField.setAccessible(true);
type = schema.getFieldType("textStandardAnalyzerDefault");
Analyzer ana1 = type.getAnalyzer();
assertTrue(ana1 instanceof StandardAnalyzer);
assertEquals(DEFAULT_VERSION, matchVersionField.get(ana1));
type = schema.getFieldType("textStandardAnalyzer20");
ana1 = type.getAnalyzer();
assertTrue(ana1 instanceof StandardAnalyzer);
assertEquals(Version.LUCENE_20, matchVersionField.get(ana1));
}
}


@@ -34,6 +34,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do" });
@@ -45,7 +46,9 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
StandardFilterFactory filterFactory = new StandardFilterFactory();
filterFactory.init(DEFAULT_VERSION_PARAM);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
assertTokenStreamContents(stream,
@@ -58,6 +61,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testKeywordTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
KeywordTokenizerFactory factory = new KeywordTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's this thing do?"});
@@ -69,6 +73,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testWhitespaceTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do?"});
@@ -80,6 +85,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testLetterTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LetterTokenizerFactory factory = new LetterTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What", "s", "this", "thing", "do"});
@@ -91,6 +97,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"what", "s", "this", "thing", "do"});
@@ -103,6 +110,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Ceska" });
}
@@ -115,6 +123,7 @@ public class TestStandardFactories extends BaseTokenTestCase {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Česka" });
}


@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="luceneMatchVersionTest" version="1.1">
<types>
<fieldtype name="text20" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory" luceneMatchVersion="LUCENE_20"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" luceneMatchVersion="LUCENE_24"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="textDefault" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="textStandardAnalyzer20" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer" luceneMatchVersion="LUCENE_20"/>
</fieldtype>
<fieldtype name="textStandardAnalyzerDefault" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
</fieldtype>
</types>
<fields>
<field name="text20" type="text20" indexed="true" stored="false" />
<field name="textDefault" type="textDefault" indexed="true" stored="false" />
<field name="textStandardAnalyzer20" type="textStandardAnalyzer20" indexed="true" stored="false" />
<field name="textStandardAnalyzerDefault" type="textStandardAnalyzerDefault" indexed="true" stored="false" />
</fields>
</schema>


@@ -45,6 +45,8 @@
not be changed if replication is in use. -->
<dataDir>${solr.data.dir:./solr/data}</dataDir>
<luceneMatchVersion>LUCENE_30</luceneMatchVersion>
<indexDefaults>
<!-- Values here affect all index writers and act as a default
unless overridden. -->