mirror of https://github.com/apache/lucene.git
SOLR-2792: Allow case insensitive Hunspell stemming
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179459 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c6623a3b3
commit
22dcd39d9e
|
@ -27,6 +27,7 @@ import java.text.ParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
public class HunspellDictionary {
|
public class HunspellDictionary {
|
||||||
|
|
||||||
|
@ -43,11 +44,15 @@ public class HunspellDictionary {
|
||||||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||||
|
|
||||||
|
private static final boolean IGNORE_CASE_DEFAULT = false;
|
||||||
|
|
||||||
private CharArrayMap<List<HunspellWord>> words;
|
private CharArrayMap<List<HunspellWord>> words;
|
||||||
private CharArrayMap<List<HunspellAffix>> prefixes;
|
private CharArrayMap<List<HunspellAffix>> prefixes;
|
||||||
private CharArrayMap<List<HunspellAffix>> suffixes;
|
private CharArrayMap<List<HunspellAffix>> suffixes;
|
||||||
|
|
||||||
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
|
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
|
||||||
|
private boolean ignoreCase = IGNORE_CASE_DEFAULT;
|
||||||
|
|
||||||
private final Version version;
|
private final Version version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -61,7 +66,22 @@ public class HunspellDictionary {
|
||||||
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
*/
|
*/
|
||||||
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
|
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
|
||||||
this(affix, Arrays.asList(dictionary), version);
|
this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
|
||||||
|
* and dictionary files
|
||||||
|
*
|
||||||
|
* @param affix InputStream for reading the hunspell affix file
|
||||||
|
* @param dictionary InputStream for reading the hunspell dictionary file
|
||||||
|
* @param version Lucene Version
|
||||||
|
* @param ignoreCase If true, dictionary matching will be case insensitive
|
||||||
|
* @throws IOException Can be thrown while reading from the InputStreams
|
||||||
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
|
*/
|
||||||
|
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
|
||||||
|
this(affix, Arrays.asList(dictionary), version, ignoreCase);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -71,15 +91,17 @@ public class HunspellDictionary {
|
||||||
* @param affix InputStream for reading the hunspell affix file
|
* @param affix InputStream for reading the hunspell affix file
|
||||||
* @param dictionaries InputStreams for reading the hunspell dictionary file
|
* @param dictionaries InputStreams for reading the hunspell dictionary file
|
||||||
* @param version Lucene Version
|
* @param version Lucene Version
|
||||||
|
* @param ignoreCase If true, dictionary matching will be case insensitive
|
||||||
* @throws IOException Can be thrown while reading from the InputStreams
|
* @throws IOException Can be thrown while reading from the InputStreams
|
||||||
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
*/
|
*/
|
||||||
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
|
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
|
||||||
this.version = version;
|
this.version = version;
|
||||||
|
this.ignoreCase = ignoreCase;
|
||||||
String encoding = getDictionaryEncoding(affix);
|
String encoding = getDictionaryEncoding(affix);
|
||||||
CharsetDecoder decoder = getJavaEncoding(encoding);
|
CharsetDecoder decoder = getJavaEncoding(encoding);
|
||||||
readAffixFile(affix, decoder);
|
readAffixFile(affix, decoder);
|
||||||
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
|
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
|
||||||
for (InputStream dictionary : dictionaries) {
|
for (InputStream dictionary : dictionaries) {
|
||||||
readDictionaryFile(dictionary, decoder);
|
readDictionaryFile(dictionary, decoder);
|
||||||
}
|
}
|
||||||
|
@ -129,8 +151,8 @@ public class HunspellDictionary {
|
||||||
* @throws IOException Can be thrown while reading from the InputStream
|
* @throws IOException Can be thrown while reading from the InputStream
|
||||||
*/
|
*/
|
||||||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
|
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
|
||||||
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
|
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
||||||
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
|
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
||||||
String line = null;
|
String line = null;
|
||||||
|
@ -308,6 +330,9 @@ public class HunspellDictionary {
|
||||||
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
|
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
|
||||||
Arrays.sort(wordForm.getFlags());
|
Arrays.sort(wordForm.getFlags());
|
||||||
entry = line.substring(0, flagSep);
|
entry = line.substring(0, flagSep);
|
||||||
|
if(ignoreCase) {
|
||||||
|
entry = entry.toLowerCase(Locale.ENGLISH);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
List<HunspellWord> entries = words.get(entry);
|
List<HunspellWord> entries = words.get(entry);
|
||||||
|
@ -408,4 +433,8 @@ public class HunspellDictionary {
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isIgnoreCase() {
|
||||||
|
return ignoreCase;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,9 +21,14 @@ import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Scanner;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -36,6 +41,7 @@ public class HunspellStemmer {
|
||||||
|
|
||||||
private final HunspellDictionary dictionary;
|
private final HunspellDictionary dictionary;
|
||||||
private final StringBuilder segment = new StringBuilder();
|
private final StringBuilder segment = new StringBuilder();
|
||||||
|
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
|
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
|
||||||
|
@ -79,7 +85,7 @@ public class HunspellStemmer {
|
||||||
*/
|
*/
|
||||||
public List<Stem> uniqueStems(char word[], int length) {
|
public List<Stem> uniqueStems(char word[], int length) {
|
||||||
List<Stem> stems = new ArrayList<Stem>();
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
|
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
|
||||||
if (dictionary.lookupWord(word, 0, length) != null) {
|
if (dictionary.lookupWord(word, 0, length) != null) {
|
||||||
stems.add(new Stem(word, length));
|
stems.add(new Stem(word, length));
|
||||||
terms.add(word);
|
terms.add(word);
|
||||||
|
@ -167,6 +173,12 @@ public class HunspellStemmer {
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
|
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
|
||||||
|
if(dictionary.isIgnoreCase()) {
|
||||||
|
for(int i=0;i<strippedWord.length;){
|
||||||
|
i += Character.toChars(
|
||||||
|
Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
segment.setLength(0);
|
segment.setLength(0);
|
||||||
segment.append(strippedWord, 0, length);
|
segment.append(strippedWord, 0, length);
|
||||||
if (!affix.checkCondition(segment)) {
|
if (!affix.checkCondition(segment)) {
|
||||||
|
@ -174,7 +186,7 @@ public class HunspellStemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Stem> stems = new ArrayList<Stem>();
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
|
|
||||||
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
|
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
|
||||||
if (words != null) {
|
if (words != null) {
|
||||||
for (HunspellWord hunspellWord : words) {
|
for (HunspellWord hunspellWord : words) {
|
||||||
|
@ -294,15 +306,24 @@ public class HunspellStemmer {
|
||||||
* @throws ParseException Can be thrown while parsing the files
|
* @throws ParseException Can be thrown while parsing the files
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args) throws IOException, ParseException {
|
public static void main(String[] args) throws IOException, ParseException {
|
||||||
if (args.length != 2) {
|
boolean ignoreCase = false;
|
||||||
System.out.println("usage: HunspellStemmer <affix location> <dic location>");
|
int offset = 0;
|
||||||
|
|
||||||
|
if (args.length < 2) {
|
||||||
|
System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
InputStream affixInputStream = new FileInputStream(args[0]);
|
if(args[offset].equals("-i")) {
|
||||||
InputStream dicInputStream = new FileInputStream(args[1]);
|
ignoreCase = true;
|
||||||
|
System.out.println("Ignoring case. All stems will be returned lowercased");
|
||||||
|
offset++;
|
||||||
|
}
|
||||||
|
|
||||||
|
InputStream affixInputStream = new FileInputStream(args[offset++]);
|
||||||
|
InputStream dicInputStream = new FileInputStream(args[offset++]);
|
||||||
|
|
||||||
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40);
|
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);
|
||||||
|
|
||||||
affixInputStream.close();
|
affixInputStream.close();
|
||||||
dicInputStream.close();
|
dicInputStream.close();
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
|
||||||
InputStream dictStream = getClass().getResourceAsStream("test.dic");
|
InputStream dictStream = getClass().getResourceAsStream("test.dic");
|
||||||
|
|
||||||
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
|
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
|
||||||
assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||||
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
|
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
|
||||||
|
|
||||||
|
|
|
@ -34,14 +34,7 @@ public class HunspellStemmerTest {
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void beforeClass() throws IOException, ParseException {
|
public static void beforeClass() throws IOException, ParseException {
|
||||||
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
|
createStemmer(true);
|
||||||
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
|
|
||||||
|
|
||||||
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
|
|
||||||
stemmer = new HunspellStemmer(dictionary);
|
|
||||||
|
|
||||||
affixStream.close();
|
|
||||||
dictStream.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -73,4 +66,61 @@ public class HunspellStemmerTest {
|
||||||
assertEquals("ab", stems.get(0).getStemString());
|
assertEquals("ab", stems.get(0).getStemString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStem_ignoreCase() throws IOException, ParseException {
|
||||||
|
List<HunspellStemmer.Stem> stems;
|
||||||
|
createStemmer(true);
|
||||||
|
|
||||||
|
stems = stemmer.stem("apache");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("apach", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("APACHE");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("apach", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("Apache");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("apach", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("foos");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("foo", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("food");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("foo", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("Foos");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("foo", stems.get(0).getStemString());
|
||||||
|
|
||||||
|
stems = stemmer.stem("Food");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("foo", stems.get(0).getStemString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStem_caseSensitive() throws IOException, ParseException {
|
||||||
|
createStemmer(false);
|
||||||
|
List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
|
||||||
|
assertEquals(0, stems.size());
|
||||||
|
|
||||||
|
stems = stemmer.stem("Apache");
|
||||||
|
assertEquals(1, stems.size());
|
||||||
|
assertEquals("Apach", stems.get(0).getStemString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
|
||||||
|
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
|
||||||
|
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
|
||||||
|
|
||||||
|
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40, ignoreCase);
|
||||||
|
stemmer = new HunspellStemmer(dictionary);
|
||||||
|
|
||||||
|
affixStream.close();
|
||||||
|
dictStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,20 @@
|
||||||
SET UTF-8
|
SET UTF-8
|
||||||
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
SFX A Y 2
|
SFX A Y 3
|
||||||
SFX A 0 e n
|
SFX A 0 e n
|
||||||
SFX A 0 e t
|
SFX A 0 e t
|
||||||
|
SFX A 0 e h
|
||||||
|
|
||||||
SFX C Y 2
|
SFX C Y 2
|
||||||
SFX C 0 d/C c
|
SFX C 0 d/C c
|
||||||
SFX C 0 c b
|
SFX C 0 c b
|
||||||
|
|
||||||
|
SFX D Y 1
|
||||||
|
SFX D 0 s o
|
||||||
|
|
||||||
|
SFX E Y 1
|
||||||
|
SFX E 0 d o
|
||||||
|
|
||||||
PFX B Y 1
|
PFX B Y 1
|
||||||
PFX B 0 s o
|
PFX B 0 s o
|
|
@ -1,6 +1,9 @@
|
||||||
5
|
6
|
||||||
lucen/A
|
lucen/A
|
||||||
lucene
|
lucene
|
||||||
mahout/A
|
mahout/A
|
||||||
olr/B
|
olr/B
|
||||||
ab/C
|
ab/C
|
||||||
|
Apach/A
|
||||||
|
foo/D
|
||||||
|
Foo/E
|
|
@ -408,6 +408,8 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
|
* SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
|
||||||
|
|
||||||
|
* SOLR-2792: Allow case insensitive Hunspell stemming (janhoy, rmuir)
|
||||||
|
|
||||||
================== 3.4.0 ==================
|
================== 3.4.0 ==================
|
||||||
|
|
||||||
Upgrading from Solr 3.3
|
Upgrading from Solr 3.3
|
||||||
|
|
|
@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
|
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
|
||||||
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
|
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
|
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
|
||||||
* Example config for British English including a custom dictionary:
|
* Example config for British English including a custom dictionary, case insensitive matching:
|
||||||
* <pre class="prettyprint" >
|
* <pre class="prettyprint" >
|
||||||
* <filter class="solr.HunspellStemFilterFactory"
|
* <filter class="solr.HunspellStemFilterFactory"
|
||||||
* dictionary="en_GB.dic,my_custom.dic"
|
* dictionary="en_GB.dic,my_custom.dic"
|
||||||
* affix="en_GB.aff"/></pre>
|
* affix="en_GB.aff"
|
||||||
* Dictionaries for many languages are available through the OpenOffice project
|
* ignoreCase="true" /></pre>
|
||||||
* <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a>
|
* Both parameters dictionary and affix are mandatory.
|
||||||
|
* <br/>
|
||||||
|
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
|
||||||
|
* <br/>
|
||||||
|
* Dictionaries for many languages are available through the OpenOffice project.
|
||||||
|
*
|
||||||
|
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
|
||||||
*/
|
*/
|
||||||
public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
|
||||||
|
private static final String PARAM_DICTIONARY = "dictionary";
|
||||||
|
private static final String PARAM_AFFIX = "affix";
|
||||||
|
private static final String PARAM_IGNORE_CASE = "ignoreCase";
|
||||||
|
private static final String TRUE = "true";
|
||||||
|
private static final String FALSE = "false";
|
||||||
|
|
||||||
private HunspellDictionary dictionary;
|
private HunspellDictionary dictionary;
|
||||||
|
private boolean ignoreCase = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads the hunspell dictionary and affix files defined in the configuration
|
* Loads the hunspell dictionary and affix files defined in the configuration
|
||||||
|
@ -48,15 +63,21 @@ public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements
|
||||||
*/
|
*/
|
||||||
public void inform(ResourceLoader loader) {
|
public void inform(ResourceLoader loader) {
|
||||||
assureMatchVersion();
|
assureMatchVersion();
|
||||||
String dictionaryFiles[] = args.get("dictionary").split(",");
|
String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
|
||||||
String affixFile = args.get("affix");
|
String affixFile = args.get(PARAM_AFFIX);
|
||||||
|
String pic = args.get(PARAM_IGNORE_CASE);
|
||||||
|
if(pic != null) {
|
||||||
|
if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
|
||||||
|
else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
|
||||||
|
else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
List<InputStream> dictionaries = new ArrayList<InputStream>();
|
List<InputStream> dictionaries = new ArrayList<InputStream>();
|
||||||
for (String file : dictionaryFiles) {
|
for (String file : dictionaryFiles) {
|
||||||
dictionaries.add(loader.openResource(file));
|
dictionaries.add(loader.openResource(file));
|
||||||
}
|
}
|
||||||
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
|
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
|
throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue