mirror of https://github.com/apache/lucene.git
SOLR-2769: Added factory for the new Hunspell stemmer (janhoy, cmale)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1175200 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5d4502ad0a
commit
48e53be99a
|
@ -349,6 +349,9 @@ New Features
|
|||
* SOLR-2066,SOLR-2776: Added support for distributed grouping.
|
||||
(Martijn van Groningen, Jasper van Veghel, Matt Beaumont)
|
||||
|
||||
* SOLR-2769: Added factory for the new Hunspell stemmer capable of doing stemming
|
||||
for 99 languages (janhoy, cmale)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
* SOLR-2748: The CommitTracker used for commitWith or autoCommit by maxTime
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
|
||||
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
/**
|
||||
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
|
||||
* Example config for British English including a custom dictionary:
|
||||
* <pre class="prettyprint" >
|
||||
* <filter class="solr.HunspellStemFilterFactory"
|
||||
* dictionary="en_GB.dic,my_custom.dic"
|
||||
* affix="en_GB.aff"/></pre>
|
||||
* Dictionaries for many languages are available through the OpenOffice project
|
||||
* @see http://wiki.services.openoffice.org/wiki/Dictionaries
|
||||
*/
|
||||
public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
private HunspellDictionary dictionary;
|
||||
|
||||
/**
|
||||
* Loads the hunspell dictionary and affix files defined in the configuration
|
||||
*
|
||||
* @param loader ResourceLoader used to load the files
|
||||
*/
|
||||
public void inform(ResourceLoader loader) {
|
||||
assureMatchVersion();
|
||||
String dictionaryFiles[] = args.get("dictionary").split(",");
|
||||
String affixFile = args.get("affix");
|
||||
|
||||
try {
|
||||
List<InputStream> dictionaries = new ArrayList<InputStream>();
|
||||
for (String file : dictionaryFiles) {
|
||||
dictionaries.add(loader.openResource(file));
|
||||
}
|
||||
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
|
||||
* TokenStream
|
||||
*
|
||||
* @param tokenStream TokenStream that will be filtered
|
||||
* @return HunspellStemFilter that filters the TokenStream
|
||||
*/
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new HunspellStemFilter(tokenStream, dictionary);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
SET UTF-8
|
||||
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
|
||||
SFX A Y 2
|
||||
SFX A 0 e n
|
||||
SFX A 0 e t
|
||||
|
||||
SFX C Y 2
|
||||
SFX C 0 d/C c
|
||||
SFX C 0 c b
|
||||
|
||||
PFX B Y 1
|
||||
PFX B 0 s o
|
|
@ -0,0 +1,6 @@
|
|||
5
|
||||
lucen/A
|
||||
lucene
|
||||
mahout/A
|
||||
olr/B
|
||||
ab/C
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Hunspell stemmer loads from factory
|
||||
*/
|
||||
public class TestHunspellStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
HunspellStemFilterFactory factory = new HunspellStemFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("dictionary", "hunspell-test.dic");
|
||||
args.put("affix", "hunspell-test.aff");
|
||||
args.put(IndexSchema.LUCENE_MATCH_VERSION_PARAM, DEFAULT_VERSION.name());
|
||||
factory.init(args);
|
||||
factory.inform(new SolrResourceLoader("solr"));
|
||||
|
||||
Reader reader = new StringReader("abc");
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream, new String[] { "ab" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue